diff --git a/.clang-format b/.clang-format index 2e606ba4bb..1defc175de 100644 --- a/.clang-format +++ b/.clang-format @@ -2,7 +2,8 @@ BasedOnStyle: Google IndentWidth: 2 -ContinuationIndentWidth: 2 +ColumnLimit: 80 +ContinuationIndentWidth: 4 UseTab: Never MaxEmptyLinesToKeep: 2 @@ -34,4 +35,5 @@ BinPackArguments: true BinPackParameters: true ConstructorInitializerAllOnOneLineOrOnePerLine: false -IndentCaseLabels: true \ No newline at end of file +IndentCaseLabels: true + diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..df06a0e5fb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,24 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Description** +A clear and concise description of what the bug is. + +**Triton Information** +What version of Triton are you using? + +Are you using the Triton container or did you build it yourself? + +**To Reproduce** +Steps to reproduce the behavior. + +Describe the models (framework, inputs, outputs), ideally include the model configuration file (if using an ensemble include the model configuration file for that as well). + +**Expected behavior** +A clear and concise description of what you expected to happen. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..bbcbbe7d61 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template_external_contrib.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template_external_contrib.md new file mode 100644 index 0000000000..4f7afde5fa --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template_external_contrib.md @@ -0,0 +1,50 @@ +#### What does the PR do? + + +#### Checklist +- [ ] I have read the [Contribution guidelines](#../../CONTRIBUTING.md) and signed the [Contributor License +Agreement](https://github.com/NVIDIA/triton-inference-server/blob/master/Triton-CCLA-v1.pdf) +- [ ] PR title reflects the change and is of format `: ` +- [ ] Changes are described in the pull request. +- [ ] Related issues are referenced. +- [ ] Populated [github labels](https://docs.github.com/en/issues/using-labels-and-milestones-to-track-work/managing-labels) field +- [ ] Added [test plan](#test-plan) and verified test passes. +- [ ] Verified that the PR passes existing CI. +- [ ] I ran pre-commit locally (`pre-commit install, pre-commit run --all`) +- [ ] Verified copyright is correct on all changed files. +- [ ] Added _succinct_ git squash message before merging [ref](https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). +- [ ] All template sections are filled out. +- [ ] Optional: Additional screenshots for behavior/output changes with before/after. 
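+
+<!-- Illustrative sketch only (not a required checklist step): one possible way to run the pre-commit hooks locally before pushing; assumes pre-commit is installed via pip -->
+```shell
+pip install pre-commit       # if the tool is not already available
+pre-commit install           # register the git hook once per clone
+pre-commit run --all-files   # run every configured hook across the repository
+```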
+ +#### Commit Type: +Check the [conventional commit type](https://github.com/angular/angular/blob/22b96b9/CONTRIBUTING.md#type) +box here and add the label to the github PR. +- [ ] build +- [ ] ci +- [ ] docs +- [ ] feat +- [ ] fix +- [ ] perf +- [ ] refactor +- [ ] revert +- [ ] style +- [ ] test + +#### Related PRs: +<!-- Related PRs from other Repositories --> + +#### Where should the reviewer start? +<!-- call out specific files that should be looked at closely --> + +#### Test plan: +<!-- list steps to verify feature works --> +<!-- were e2e tests added?--> + +#### Caveats: +<!-- any limitations or possible things missing from this PR --> + +#### Background +<!-- e.g. what led to this change being made. this is optional extra information to help the reviewer --> + +#### Related Issues: (use one of the action keywords Closes / Fixes / Resolves / Relates to) +- closes GitHub issue: #xxx diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template_internal_contrib.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template_internal_contrib.md new file mode 100644 index 0000000000..b1e520b4aa --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template_internal_contrib.md @@ -0,0 +1,50 @@ +#### What does the PR do? +<!-- Describe your pull request here. Please read the text below the line, and make sure you follow the checklist.--> + +#### Checklist +- [ ] PR title reflects the change and is of format `<commit_type>: <Title>` +- [ ] Changes are described in the pull request. +- [ ] Related issues are referenced. +- [ ] Populated [github labels](https://docs.github.com/en/issues/using-labels-and-milestones-to-track-work/managing-labels) field +- [ ] Added [test plan](#test-plan) and verified test passes. +- [ ] Verified that the PR passes existing CI. +- [ ] Verified copyright is correct on all changed files. +- [ ] Added _succinct_ git squash message before merging [ref](https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). +- [ ] All template sections are filled out. +- [ ] Optional: Additional screenshots for behavior/output changes with before/after. + +#### Commit Type: +Check the [conventional commit type](https://github.com/angular/angular/blob/22b96b9/CONTRIBUTING.md#type) +box here and add the label to the github PR. +- [ ] build +- [ ] ci +- [ ] docs +- [ ] feat +- [ ] fix +- [ ] perf +- [ ] refactor +- [ ] revert +- [ ] style +- [ ] test + +#### Related PRs: +<!-- Related PRs from other Repositories --> + +#### Where should the reviewer start? +<!-- call out specific files that should be looked at closely --> + +#### Test plan: +<!-- list steps to verify --> +<!-- were e2e tests added?--> + +- CI Pipeline ID: +<!-- Only Pipeline ID and no direct link here --> + +#### Caveats: +<!-- any limitations or possible things missing from this PR --> + +#### Background +<!-- e.g. what led to this change being made. this is optional extra information to help the reviewer --> + +#### Related Issues: (use one of the action keywords Closes / Fixes / Resolves / Relates to) +- closes GitHub issue: #xxx diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..0787dcbc60 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,13 @@ +Thanks for submitting a PR to Triton! 
+Please go the the `Preview` tab above this description box and select the appropriate sub-template: + +* [PR description template for Triton Engineers](?expand=1&template=pull_request_template_internal_contrib.md) +* [PR description template for External Contributors](?expand=1&template=pull_request_template_external_contrib.md) + +If you already created the PR, please replace this message with one of +* [External contribution template](https://raw.githubusercontent.com/triton-inference-server/server/main/.github/PULL_REQUEST_TEMPLATE/pull_request_template_external_contrib.md) +* [Internal contribution template](https://raw.githubusercontent.com/triton-inference-server/server/main/.github/PULL_REQUEST_TEMPLATE/pull_request_template_internal_contrib.md) + +and fill it out. + + diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..745a33730b --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,84 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "CodeQL" + +on: + pull_request: + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # Details on CodeQL's query packs refer to: + # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + queries: +security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # Command-line programs to run using the OS shell. + # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000000..531cc2911b --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,39 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 + diff --git a/.gitignore b/.gitignore index 4e1f8ef0cc..f1b69cb25e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,8 @@ -/bazel-bin -/bazel-ci_build-cache -/bazel-genfiles -/bazel-trtserver -/bazel-out -/bazel-serving -/bazel-tensorflow -/bazel-tensorflow_serving -/bazel-testlogs -/bazel-tf -/bazel-workspace +/build +/builddir +/.vscode +*.so +__pycache__ +tmp +*.log +test_results.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..f44f815351 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,74 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +repos: +- repo: https://github.com/timothycrosley/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + exclude: ^deploy(\/[^\/]+)*\/templates\/.*$ + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..f8fb8d09fb --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,7 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +title: "Triton Inference Server: An Optimized Cloud and Edge Inferencing Solution." +url: https://github.com/triton-inference-server +repository-code: https://github.com/triton-inference-server/server +authors: + - name: "NVIDIA Corporation" diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..56cb346dc0 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,269 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required(VERSION 3.18) + +project(tritonserver LANGUAGES C CXX) + +include(CMakeDependentOption) + +# Use C++17 standard as Triton's minimum required. +set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which features are requested to build this target.") + +set(TRITON_VERSION "0.0.0" CACHE STRING "The version of the Triton shared library" ) + +option(TRITON_ENABLE_LOGGING "Include logging support in server" ON) +option(TRITON_ENABLE_STATS "Include statistics collections in server" ON) +option(TRITON_ENABLE_TRACING "Include tracing support in server" OFF) +option(TRITON_ENABLE_NVTX "Include NVTX support in server" OFF) +option(TRITON_ENABLE_GPU "Enable GPU support in server" ON) +option(TRITON_ENABLE_MALI_GPU "Enable Arm Mali GPU support in server" OFF) +option(TRITON_IGPU_BUILD "Enable options for iGPU compilation in sever" OFF) +set(TRITON_MIN_COMPUTE_CAPABILITY "6.0" CACHE STRING + "The minimum CUDA compute capability supported by Triton" ) +set(TRITON_EXTRA_LIB_PATHS "" CACHE PATH "Extra library paths for Triton Server build") + +# Ensemble +option(TRITON_ENABLE_ENSEMBLE "Include ensemble support in server" OFF) + +# Endpoints +option(TRITON_ENABLE_HTTP "Include HTTP API in server" ON) +option(TRITON_ENABLE_GRPC "Include GRPC API in server" ON) +option(TRITON_ENABLE_SAGEMAKER "Include AWS SageMaker API in server" OFF) +option(TRITON_ENABLE_VERTEX_AI "Include Vertex AI API in server" OFF) + +# Metrics +option(TRITON_ENABLE_METRICS "Include metrics support in server" ON) +option(TRITON_ENABLE_METRICS_GPU "Include GPU metrics support in server" ON) +option(TRITON_ENABLE_METRICS_CPU "Include CPU metrics support in server" ON) + +# Cloud storage +option(TRITON_ENABLE_GCS "Include GCS Filesystem support in server" OFF) +option(TRITON_ENABLE_S3 "Include S3 Filesystem support in server" OFF) +option(TRITON_ENABLE_AZURE_STORAGE "Include Azure Storage Filesystem support in server" OFF) + +# Need to know if TensorRT is available when building unit tests +option(TRITON_ENABLE_TENSORRT "Include TensorRT backend in server" OFF) + +# ASAN +option(TRITON_ENABLE_ASAN "Build with address sanitizer" OFF) + +# Repo tags +set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from") +set(TRITON_THIRD_PARTY_REPO_TAG "main" CACHE STRING + "Tag for triton-inference-server/third_party repo") +set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo") +set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo") +set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo") + +# Third-party location +set(TRITON_THIRD_PARTY_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/third-party" CACHE STRING "Location of third-party build") +set(TRITON_THIRD_PARTY_SRC_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/third-party-src" CACHE STRING "Location of third-party source") + +if(TRITON_ENABLE_METRICS AND NOT 
TRITON_ENABLE_STATS) + message(FATAL_ERROR "TRITON_ENABLE_METRICS=ON requires TRITON_ENABLE_STATS=ON") +endif() + +if(TRITON_ENABLE_TRACING AND NOT TRITON_ENABLE_STATS) + message(FATAL_ERROR "TRITON_ENABLE_TRACING=ON requires TRITON_ENABLE_STATS=ON") +endif() + +if (TRITON_ENABLE_METRICS_CPU AND NOT TRITON_ENABLE_METRICS) + message(FATAL_ERROR "TRITON_ENABLE_METRICS_CPU=ON requires TRITON_ENABLE_METRICS=ON") +endif() + +if (TRITON_ENABLE_METRICS_GPU AND NOT TRITON_ENABLE_METRICS) + message(FATAL_ERROR "TRITON_ENABLE_METRICS_GPU=ON requires TRITON_ENABLE_METRICS=ON") +endif() + +if (TRITON_ENABLE_METRICS_GPU AND NOT TRITON_ENABLE_GPU) + message(FATAL_ERROR "TRITON_ENABLE_METRICS_GPU=ON requires TRITON_ENABLE_GPU=ON") +endif() + +if(TRITON_ENABLE_ASAN AND TRITON_ENABLE_GPU) + message(FATAL_ERROR "TRITON_ENABLE_ASAN=ON requires TRITON_ENABLE_GPU=OFF") +endif() + +# +# Dependencies +# +include(FetchContent) + +FetchContent_Declare( + repo-core + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git + GIT_TAG ${TRITON_CORE_REPO_TAG} +) +FetchContent_Declare( + repo-third-party + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/third_party.git + GIT_TAG ${TRITON_THIRD_PARTY_REPO_TAG} +) + +# Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead +# of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set (LIB_DIR "lib64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) +set(TRITON_CORE_HEADERS_ONLY OFF) + +FetchContent_MakeAvailable(repo-third-party repo-core) + +# +# Triton server executable and examples +# + +# Need to use ExternalProject for our builds so that we can get the +# correct dependencies between Triton executable and the +# ExternalProject dependencies (found in the third_party repo) +include(ExternalProject) + +# If CMAKE_TOOLCHAIN_FILE is set, propagate that hint path to the external +# projects. +set(_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE "") +if (CMAKE_TOOLCHAIN_FILE) + set(_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE "-DCMAKE_TOOLCHAIN_FILE:PATH=${CMAKE_TOOLCHAIN_FILE}") +endif() + +# If VCPKG_TARGET_TRIPLET is set, propagate that hint path to the external +# projects. +set(_CMAKE_ARGS_VCPKG_TARGET_TRIPLET "") +if (VCPKG_TARGET_TRIPLET) + set(_CMAKE_ARGS_VCPKG_TARGET_TRIPLET "-DVCPKG_TARGET_TRIPLET:STRING=${VCPKG_TARGET_TRIPLET}") +endif() + +# If OPENSSL_ROOT_DIR is set, propagate that hint path to the external +# projects with OpenSSL dependency. 
+set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "") +if (OPENSSL_ROOT_DIR) + set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}") +endif() + +# Location where protobuf-config.cmake will be installed varies by +# platform +if (WIN32) + set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/protobuf/cmake") +else() + set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/protobuf/${LIB_DIR}/cmake/protobuf") +endif() + +# Triton with Opentelemetry is not supported on Windows +# FIXME: add location for Windows, when support is added +# JIRA DLIS-4786 +if (WIN32) + set(_FINDPACKAGE_OPENTELEMETRY_CONFIG_DIR "") +else() + set(_FINDPACKAGE_OPENTELEMETRY_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/opentelemetry-cpp/${LIB_DIR}/cmake/opentelemetry-cpp") +endif() + +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(TRITON_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/install) +else() + set(TRITON_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}) +endif() + +set(TRITON_DEPENDS triton-core protobuf googletest re2) +if(${TRITON_ENABLE_GCS}) + set(TRITON_DEPENDS ${TRITON_DEPENDS} google-cloud-cpp) +endif() # TRITON_ENABLE_GCS +if(${TRITON_ENABLE_S3}) + set(TRITON_DEPENDS ${TRITON_DEPENDS} aws-sdk-cpp) +endif() # TRITON_ENABLE_S3 +if(${TRITON_ENABLE_HTTP} OR ${TRITON_ENABLE_METRICS} OR ${TRITON_ENABLE_SAGEMAKER} OR ${TRITON_ENABLE_VERTEX_AI}) + set(TRITON_DEPENDS ${TRITON_DEPENDS} libevent libevhtp) +endif() # TRITON_ENABLE_HTTP || TRITON_ENABLE_METRICS || TRITON_ENABLE_SAGEMAKER || TRITON_ENABLE_VERTEX_AI +if(${TRITON_ENABLE_GRPC}) + set(TRITON_DEPENDS ${TRITON_DEPENDS} grpc) +endif() # TRITON_ENABLE_GRPC +if(NOT WIN32 AND ${TRITON_ENABLE_TRACING}) + set(TRITON_DEPENDS ${TRITON_DEPENDS} opentelemetry-cpp) +endif() # TRITON_ENABLE_TRACING + +ExternalProject_Add(triton-server + PREFIX triton-server + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/triton-server" + CMAKE_CACHE_ARGS + -DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR} + ${_CMAKE_ARGS_OPENSSL_ROOT_DIR} + ${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE} + ${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET} + -DGTEST_ROOT:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/googletest + -DgRPC_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/grpc/lib/cmake/grpc + -Dc-ares_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/c-ares/${LIB_DIR}/cmake/c-ares + -Dre2_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/re2/${LIB_DIR}/cmake/re2 + -Dabsl_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/absl/${LIB_DIR}/cmake/absl + -DCURL_DIR:STRING=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/curl/${LIB_DIR}/cmake/CURL + -Dnlohmann_json_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/nlohmann_json/${LIB_DIR}/cmake/nlohmann_json + -DLibevent_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/libevent/lib/cmake/libevent + -Dlibevhtp_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/libevhtp/lib/cmake/libevhtp + -Dstorage_client_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/google-cloud-cpp/${LIB_DIR}/cmake/storage_client + -Dgoogle_cloud_cpp_common_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/google-cloud-cpp/${LIB_DIR}/cmake/google_cloud_cpp_common + -DCrc32c_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/crc32c/${LIB_DIR}/cmake/Crc32c + -DAWSSDK_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/cmake/AWSSDK + -Daws-cpp-sdk-core_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/cmake/aws-cpp-sdk-core + 
-Daws-cpp-sdk-s3_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/cmake/aws-cpp-sdk-s3 + -Daws-c-event-stream_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/aws-c-event-stream/cmake + -Daws-c-common_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/aws-c-common/cmake + -Daws-checksums_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/aws-checksums/cmake + -Dopentelemetry-cpp_DIR:PATH=${_FINDPACKAGE_OPENTELEMETRY_CONFIG_DIR} + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} + -DTRITON_IGPU_BUILD:BOOL=${TRITON_IGPU_BUILD} + -DTRITON_THIRD_PARTY_REPO_TAG:STRING=${TRITON_THIRD_PARTY_REPO_TAG} + -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} + -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} + -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} + -DTRITON_EXTRA_LIB_PATHS:PATH=${TRITON_EXTRA_LIB_PATHS} + -DTRITON_ENABLE_ASAN:BOOL=${TRITON_ENABLE_ASAN} + -DTRITON_ENABLE_NVTX:BOOL=${TRITON_ENABLE_NVTX} + -DTRITON_ENABLE_TRACING:BOOL=${TRITON_ENABLE_TRACING} + -DTRITON_ENABLE_LOGGING:BOOL=${TRITON_ENABLE_LOGGING} + -DTRITON_ENABLE_STATS:BOOL=${TRITON_ENABLE_STATS} + -DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU} + -DTRITON_ENABLE_MALI_GPU:BOOL=${TRITON_ENABLE_MALI_GPU} + -DTRITON_ENABLE_HTTP:BOOL=${TRITON_ENABLE_HTTP} + -DTRITON_ENABLE_SAGEMAKER:BOOL=${TRITON_ENABLE_SAGEMAKER} + -DTRITON_ENABLE_VERTEX_AI:BOOL=${TRITON_ENABLE_VERTEX_AI} + -DTRITON_ENABLE_GRPC:BOOL=${TRITON_ENABLE_GRPC} + -DTRITON_MIN_COMPUTE_CAPABILITY:STRING=${TRITON_MIN_COMPUTE_CAPABILITY} + -DTRITON_ENABLE_METRICS:BOOL=${TRITON_ENABLE_METRICS} + -DTRITON_ENABLE_METRICS_GPU:BOOL=${TRITON_ENABLE_METRICS_GPU} + -DTRITON_ENABLE_METRICS_CPU:BOOL=${TRITON_ENABLE_METRICS_CPU} + -DTRITON_ENABLE_GCS:BOOL=${TRITON_ENABLE_GCS} + -DTRITON_ENABLE_AZURE_STORAGE:BOOL=${TRITON_ENABLE_AZURE_STORAGE} + -DTRITON_ENABLE_S3:BOOL=${TRITON_ENABLE_S3} + -DTRITON_ENABLE_TENSORRT:BOOL=${TRITON_ENABLE_TENSORRT} + -DTRITON_ENABLE_ENSEMBLE:BOOL=${TRITON_ENABLE_ENSEMBLE} + -DTRITON_MIN_CXX_STANDARD:STRING=${TRITON_MIN_CXX_STANDARD} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX:PATH=${TRITON_INSTALL_PREFIX} + -DTRITON_VERSION:STRING=${TRITON_VERSION} + DEPENDS ${TRITON_DEPENDS} +) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 84be37f175..59e0ace975 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,5 @@ <!-- -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +26,43 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> +# Contribution Guidelines + +Contributions that fix documentation errors or that make small changes +to existing code can be contributed directly by following the rules +below and submitting an appropriate PR. + +Contributions intended to add significant new functionality must +follow a more collaborative path described in the following +points. Before submitting a large PR that adds a major enhancement or +extension, be sure to submit a GitHub issue that describes the +proposed change so that the Triton team can provide feedback. + +- As part of the GitHub issue discussion, a design for your change + will be agreed upon. 
An up-front design discussion is required to + ensure that your enhancement is done in a manner that is consistent + with Triton's overall architecture. + +- The Triton project is spread across multiple repos. The Triton team + will provide guidance about how and where your enhancement should be + implemented. + +- [Testing](docs/customization_guide/test.md) is a critical part of any Triton + enhancement. You should plan on spending significant time on + creating tests for your change. The Triton team will help you to + design your testing so that it is compatible with existing testing + infrastructure. + +- If your enhancement provides a user visible feature then you need to + provide documentation. + # Contribution Rules -- The code style convention is enforced by clang-format. See the - Developer Guide for instructions on how to ensure your contributions - conform. In general please follow the existing conventions in the - relevant file, submodule, module, and project when you add new code - or when you extend/fix existing functionality. +- The code style convention is enforced by clang-format. See below on + how to ensure your contributions conform. In general please follow + the existing conventions in the relevant file, submodule, module, + and project when you add new code or when you extend/fix existing + functionality. - Avoid introducing unnecessary complexity into existing code so that maintainability and readability are preserved. @@ -54,10 +84,10 @@ - Make sure all `L0_*` tests pass: - In the `qa/` directory, there are basic sanity tests scripted in - directories named `L0_...`. See the Testing section in the - Developer Guide for instructions on running these tests. + directories named `L0_...`. See the [Test](docs/customization_guide/test.md) + documentation for instructions on running these tests. -- TensorRT Inference Server's default build assumes recent versions of +- Triton Inference Server's default build assumes recent versions of dependencies (CUDA, TensorFlow, PyTorch, TensorRT, etc.). Contributions that add compatibility with older versions of those dependencies will be considered, but NVIDIA cannot guarantee @@ -66,64 +96,32 @@ - Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You need - to [`sign`](#Sign) your commit. + to complete the CLA described below before your PR can be merged. - Thanks in advance for your patience as we review your contributions; we do appreciate them! -<a name="Sign"></a>Sign Your Work --------------- - -We require that all contributors "sign-off" on their commits. This -certifies that the contribution is your original work, or you have -rights to submit it under the same license, or a compatible license. - -Any contribution which contains commits that are not Signed-Off will -not be accepted. - -To sign off on a commit you simply use the `--signoff` (or `-s`) -option when committing your changes: - - $ git commit -s -m "Add cool feature." - -This will append the following to your commit message: - - Signed-off-by: Your Name <your@email.com> - -By doing this you certify the below: - - Developer Certificate of Origin - Version 1.1 - - Copyright (C) 2004, 2006 The Linux Foundation and its contributors. - 1 Letterman Drive - Suite D4700 - San Francisco, CA, 94129 - - Everyone is permitted to copy and distribute verbatim copies of - this license document, but changing it is not allowed. 
- - - Developer's Certificate of Origin 1.1 - - By making a contribution to this project, I certify that: - - (a) The contribution was created in whole or in part by me and I - have the right to submit it under the open source license - indicated in the file; or - - (b) The contribution is based upon previous work that, to the best - of my knowledge, is covered under an appropriate open source - license and I have the right under that license to submit that - work with modifications, whether created in whole or in part by - me, under the same open source license (unless I am permitted to - submit under a different license), as indicated in the file; or - - (c) The contribution was provided directly to me by some other - person who certified (a), (b) or (c) and I have not modified it. - - (d) I understand and agree that this project and the contribution - are public and that a record of the contribution (including all - personal information I submit with it, including my sign-off) is - maintained indefinitely and may be redistributed consistent with - this project or the open source license(s) involved. +# Coding Convention + +All pull requests are checked against the +[pre-commit hooks](https://github.com/pre-commit/pre-commit-hooks) +located [in the repository's top-level .pre-commit-config.yaml](https://github.com/NVIDIA/triton-inference-server/blob/master/pre-commit-config.yaml). +The hooks do some sanity checking like linting and formatting. +These checks must pass to merge a change. + +To run these locally, you can +[install pre-commit,](https://pre-commit.com/#install) +then run `pre-commit install` inside the cloned repo. When you +commit a change, the pre-commit hooks will run automatically. +If a fix is implemented by a pre-commit hook, adding the file again +and running `git commit` a second time will pass and successfully +commit. + +# Contributor License Agreement (CLA) + +Triton requires that all contributors (or their corporate entity) send +a signed copy of the [Contributor License +Agreement](https://github.com/NVIDIA/triton-inference-server/blob/master/Triton-CCLA-v1.pdf) +to triton-cla@nvidia.com. +*NOTE*: Contributors with no company affiliation can fill `N/A` in the +`Corporation Name` and `Corporation Address` fields. diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index ff42f64f4a..0000000000 --- a/Dockerfile +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# -# Multistage build. -# - -ARG BASE_IMAGE=nvcr.io/nvidia/tensorrtserver:18.11-py3 -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:18.11-py3 -ARG TENSORFLOW_IMAGE=nvcr.io/nvidia/tensorflow:18.11-py3 - -############################################################################ -## Caffe2 stage: Use PyTorch container to get Caffe2 backend -############################################################################ -FROM ${PYTORCH_IMAGE} AS trtserver_caffe2 - -ARG BUILD_CLIENTS_ONLY=0 - -# We cannot just pull libraries from the PyTorch container... we need -# to: -# - copy over netdef_bundle_c2 interface so it can build with other -# C2 sources -# - need to patch to delegate logging to the inference server. - -# Copy netdef_bundle_c2 into Caffe2 core so it builds into the -# libcaffe2 library. We want netdef_bundle_c2 to build against the -# Caffe2 protobuf since it interfaces with that code. -COPY src/servables/caffe2/netdef_bundle_c2.* \ - /opt/pytorch/pytorch/caffe2/core/ - -# Modify the C2 logging library to delegate logging to the trtserver -# logger. Use a checksum to detect if the C2 logging file has -# changed... if it has need to verify our patch is still valid and -# update the patch/checksum as necessary. -COPY tools/patch/caffe2 /tmp/patch/caffe2 -RUN sha1sum -c /tmp/patch/caffe2/checksums && \ - patch -i /tmp/patch/caffe2/core/logging.cc \ - /opt/pytorch/pytorch/caffe2/core/logging.cc && \ - patch -i /tmp/patch/caffe2/core/logging_is_not_google_glog.h \ - /opt/pytorch/pytorch/caffe2/core/logging_is_not_google_glog.h && \ - patch -i /tmp/patch/caffe2/core/context_gpu.cu \ - /opt/pytorch/pytorch/caffe2/core/context_gpu.cu - -# Build same as in pytorch container... 
except for the NO_DISTRIBUTED -# line where we turn off features not needed for trtserver -WORKDIR /opt/pytorch -RUN pip uninstall -y torch -RUN bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \ - cd pytorch && \ - TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5+PTX" \ - CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ - NCCL_INCLUDE_DIR="/usr/include/" \ - NCCL_LIB_DIR="/usr/lib/" \ - NO_DISTRIBUTED=1 NO_TEST=1 NO_MIOPEN=1 USE_OPENCV=OFF USE_LEVELDB=OFF \ - python setup.py install && python setup.py clean; \ - else \ - mkdir -p /opt/conda/lib/python3.6/site-packages/torch/lib; \ - mkdir -p /opt/conda/lib; \ - touch /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_detectron_ops_gpu.so; \ - touch /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2.so; \ - touch /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so; \ - touch /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so; \ - touch /opt/conda/lib/libmkl_avx2.so; \ - touch /opt/conda/lib/libmkl_core.so; \ - touch /opt/conda/lib/libmkl_def.so; \ - touch /opt/conda/lib/libmkl_gnu_thread.so; \ - touch /opt/conda/lib/libmkl_intel_lp64.so; fi' - -############################################################################ -## Build stage: Build inference server based on TensorFlow container -############################################################################ -FROM ${TENSORFLOW_IMAGE} AS trtserver_build - -ARG TRTIS_VERSION=0.10.0dev -ARG TRTIS_CONTAINER_VERSION=19.01dev -ARG PYVER=3.5 -ARG BUILD_CLIENTS_ONLY=0 - -# The TFServing release branch must match the TF release used by -# TENSORFLOW_IMAGE -ARG TFS_BRANCH=r1.12 - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - automake \ - libcurl3-dev \ - libopencv-dev \ - libopencv-core-dev \ - libtool - -RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ - python$PYVER get-pip.py && \ - rm get-pip.py - -RUN pip install --upgrade setuptools - -# Caffe2 library requirements... -COPY --from=trtserver_caffe2 \ - /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_detectron_ops_gpu.so \ - /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 \ - /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2.so \ - /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 \ - /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so \ - /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 \ - /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so \ - /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_avx2.so /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_core.so /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_def.so /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_gnu_thread.so /opt/tensorrtserver/lib/ -COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_intel_lp64.so /opt/tensorrtserver/lib/ - -# Copy entire repo into container even though some is not needed for -# build itself... because we want to be able to copyright check on -# files that aren't directly needed for build. -WORKDIR /workspace -RUN rm -fr * -COPY . . - -# Pull the TFS release that matches the version of TF being used. -RUN git clone --single-branch -b ${TFS_BRANCH} https://github.com/tensorflow/serving.git - -# Modify the TF logging library to delegate logging to the trtserver -# logger. Use a checksum to detect if the TF logging file has -# changed... 
if it has need to verify our patch is still valid and -# update the patch/checksum as necessary. -RUN sha1sum -c tools/patch/tensorflow/checksums && \ - patch -i tools/patch/tensorflow/cc/saved_model/loader.cc \ - /opt/tensorflow/tensorflow/cc/saved_model/loader.cc && \ - patch -i tools/patch/tensorflow/core/platform/default/logging.cc \ - /opt/tensorflow/tensorflow/core/platform/default/logging.cc - -# TFS modifications. Use a checksum to detect if the TFS file has -# changed... if it has need to verify our patch is still valid and -# update the patch/checksum as necessary. -RUN sha1sum -c tools/patch/tfs/checksums && \ - patch -i tools/patch/tfs/model_servers/server_core.cc \ - /workspace/serving/tensorflow_serving/model_servers/server_core.cc && \ - patch -i tools/patch/tfs/sources/storage_path/file_system_storage_path_source.cc \ - /workspace/serving/tensorflow_serving/sources/storage_path/file_system_storage_path_source.cc && \ - patch -i tools/patch/tfs/sources/storage_path/file_system_storage_path_source.h \ - /workspace/serving/tensorflow_serving/sources/storage_path/file_system_storage_path_source.h && \ - patch -i tools/patch/tfs/sources/storage_path/file_system_storage_path_source.proto \ - /workspace/serving/tensorflow_serving/sources/storage_path/file_system_storage_path_source.proto && \ - patch -i tools/patch/tfs/util/retrier.cc \ - /workspace/serving/tensorflow_serving/util/retrier.cc && \ - patch -i tools/patch/tfs/util/BUILD \ - /workspace/serving/tensorflow_serving/util/BUILD && \ - patch -i tools/patch/tfs/util/net_http/server/internal/evhttp_request.cc \ - /workspace/serving/tensorflow_serving/util/net_http/server/internal/evhttp_request.cc && \ - patch -i tools/patch/tfs/util/net_http/server/internal/evhttp_request.h \ - /workspace/serving/tensorflow_serving/util/net_http/server/internal/evhttp_request.h && \ - patch -i tools/patch/tfs/util/net_http/server/public/BUILD \ - /workspace/serving/tensorflow_serving/util/net_http/server/public/BUILD && \ - patch -i tools/patch/tfs/util/net_http/server/public/server_request_interface.h \ - /workspace/serving/tensorflow_serving/util/net_http/server/public/server_request_interface.h && \ - patch -i tools/patch/tfs/workspace.bzl \ - /workspace/serving/tensorflow_serving/workspace.bzl - -ENV TF_NEED_GCP 1 -ENV TF_NEED_S3 1 - -# Build the server, clients and any testing artifacts -RUN (cd /opt/tensorflow && ./nvbuild.sh --python$PYVER --configonly) && \ - (cd tools && mv bazel.rc bazel.orig && \ - cat bazel.orig /opt/tensorflow/.tf_configure.bazelrc > bazel.rc) && \ - bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \ - bazel build -c opt --config=cuda src/servers/trtserver src/clients/... src/test/...; \ - else \ - bazel build -c opt src/clients/...; \ - fi' && \ - (cd /opt/tensorrtserver && ln -s /workspace/qa qa) && \ - mkdir -p /opt/tensorrtserver/bin && \ - cp bazel-bin/src/clients/c++/image_client /opt/tensorrtserver/bin/. && \ - cp bazel-bin/src/clients/c++/perf_client /opt/tensorrtserver/bin/. && \ - cp bazel-bin/src/clients/c++/simple_client /opt/tensorrtserver/bin/. && \ - mkdir -p /opt/tensorrtserver/lib && \ - cp bazel-bin/src/clients/c++/librequest.so /opt/tensorrtserver/lib/. && \ - cp bazel-bin/src/clients/c++/librequest.a /opt/tensorrtserver/lib/. && \ - mkdir -p /opt/tensorrtserver/pip && \ - bazel-bin/src/clients/python/build_pip /opt/tensorrtserver/pip/. 
&& \ - bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \ - cp bazel-bin/src/servers/trtserver /opt/tensorrtserver/bin/.; \ - cp bazel-bin/src/test/caffe2plan /opt/tensorrtserver/bin/.; \ - fi' && \ - bazel clean --expunge && \ - rm -rf /root/.cache/bazel && \ - rm -rf /tmp/* - -ENV TENSORRT_SERVER_VERSION ${TRTIS_VERSION} -ENV NVIDIA_TENSORRT_SERVER_VERSION ${TRTIS_CONTAINER_VERSION} -ENV PYVER ${PYVER} - -COPY nvidia_entrypoint.sh /opt/tensorrtserver -ENTRYPOINT ["/opt/tensorrtserver/nvidia_entrypoint.sh"] - -############################################################################ -## Production stage: Create container with just inference server executable -############################################################################ -FROM ${BASE_IMAGE} - -ARG TRTIS_VERSION=0.10.0dev -ARG TRTIS_CONTAINER_VERSION=19.01dev -ARG PYVER=3.5 - -ENV TENSORRT_SERVER_VERSION ${TRTIS_VERSION} -ENV NVIDIA_TENSORRT_SERVER_VERSION ${TRTIS_CONTAINER_VERSION} -LABEL com.nvidia.tensorrtserver.version="${TENSORRT_SERVER_VERSION}" - -ENV LD_LIBRARY_PATH /opt/tensorrtserver/lib:${LD_LIBRARY_PATH} -ENV PATH /opt/tensorrtserver/bin:${PATH} -ENV PYVER ${PYVER} - -ENV TF_ADJUST_HUE_FUSED 1 -ENV TF_ADJUST_SATURATION_FUSED 1 -ENV TF_ENABLE_WINOGRAD_NONFUSED 1 -ENV TF_AUTOTUNE_THRESHOLD 2 - -# Create a user that can be used to run the tensorrt-server as -# non-root. Make sure that this user to given ID 1000. -ENV TENSORRT_SERVER_USER=tensorrt-server -RUN id -u $TENSORRT_SERVER_USER > /dev/null 2>&1 || \ - useradd $TENSORRT_SERVER_USER && \ - [ `id -u $TENSORRT_SERVER_USER` -eq 1000 ] && \ - [ `id -g $TENSORRT_SERVER_USER` -eq 1000 ] - -WORKDIR /opt/tensorrtserver -RUN rm -fr /opt/tensorrtserver/* -COPY LICENSE . -COPY --from=trtserver_build /workspace/serving/LICENSE LICENSE.tfserving -COPY --from=trtserver_build /opt/tensorflow/LICENSE LICENSE.tensorflow -COPY --from=trtserver_caffe2 /opt/pytorch/pytorch/LICENSE LICENSE.pytorch -COPY --from=trtserver_build /opt/tensorrtserver/bin/trtserver bin/ -COPY --from=trtserver_build /opt/tensorrtserver/lib lib - -COPY nvidia_entrypoint.sh /opt/tensorrtserver -ENTRYPOINT ["/opt/tensorrtserver/nvidia_entrypoint.sh"] - -ARG NVIDIA_BUILD_ID -ENV NVIDIA_BUILD_ID ${NVIDIA_BUILD_ID:-<unknown>} -LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}" -ARG NVIDIA_BUILD_REF -LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}" diff --git a/Dockerfile.QA b/Dockerfile.QA index bb607c3ac0..68ab519b41 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,72 +24,380 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# Multistage build. -# +ARG BASE_IMAGE=tritonserver +ARG CIBASE_IMAGE=tritonserver_cibase +ARG SDK_IMAGE=tritonserver_sdk +ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server +ARG TRITON_COMMON_REPO_TAG=main +ARG TRITON_CORE_REPO_TAG=main +ARG TRITON_THIRD_PARTY_REPO_TAG=main +ARG TRITON_BACKEND_REPO_TAG=main +ARG TRITONTMP_DIR=/tmp +ARG IGPU_BUILD=0 + +############################################################################ +## Test artifacts built as part of the tritonserver build are +## available in CIBASE_IMAGE. Copy these artifacts into the QA area. 
+############################################################################ +FROM ${CIBASE_IMAGE} AS cibase + +ARG TRITONTMP_DIR +ARG TRITON_REPO_ORGANIZATION +ARG TRITON_COMMON_REPO_TAG +ARG TRITON_CORE_REPO_TAG +ARG TRITON_THIRD_PARTY_REPO_TAG +ARG TRITON_BACKEND_REPO_TAG +ARG IGPU_BUILD + +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libarchive-dev \ + libboost-dev \ + python3-dev \ + python3-pip \ + rapidjson-dev \ + software-properties-common && \ + rm -rf /var/lib/apt/lists/* + +RUN pip3 install --upgrade pip && \ + pip3 install --upgrade wheel setuptools + +RUN apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . /etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* + +# Add inception_graphdef model to example repo +WORKDIR /workspace/docs/examples/model_repository +RUN mkdir -p inception_graphdef/1 && \ + wget -O ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb.tar.gz \ + https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz && \ + (cd ${TRITONTMP_DIR} && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) && \ + mv ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb inception_graphdef/1/model.graphdef + +# Update the qa/ directory with test executables, models, etc. +WORKDIR /workspace +RUN mkdir -p qa/common && \ + cp -r /workspace/src/test/models/repeat_int32 qa/L0_decoupled/models/ && \ + cp -r /workspace/src/test/models/square_int32 qa/L0_decoupled/models/ && \ + mkdir qa/L0_simple_example/models && \ + cp -r docs/examples/model_repository/simple qa/L0_simple_example/models/. && \ + mkdir qa/L0_simple_go_client/models && \ + cp -r docs/examples/model_repository/simple qa/L0_simple_go_client/models/. && \ + mkdir qa/L0_backend_release/simple_models && \ + cp -r docs/examples/model_repository/simple qa/L0_backend_release/simple_models/. && \ + mkdir qa/L0_simple_nodejs_client/models && \ + cp -r docs/examples/model_repository/simple qa/L0_simple_nodejs_client/models/. && \ + mkdir qa/L0_backend_release/simple_seq_models && \ + cp -r /workspace/docs/examples/model_repository/simple_sequence qa/L0_backend_release/simple_seq_models/. && \ + mkdir qa/L0_shared_memory/models && \ + cp -r docs/examples/model_repository/simple qa/L0_shared_memory/models/. && \ + mkdir qa/L0_cuda_shared_memory/models && \ + cp -r docs/examples/model_repository/simple qa/L0_cuda_shared_memory/models/. 
&& \ + mkdir qa/L0_client_java/models && \ + cp -r docs/examples/model_repository/simple qa/L0_client_java/models && \ + mkdir qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/simple qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/simple_int8 qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/simple_identity qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/simple_sequence qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/simple_string qa/L0_grpc/models && \ + cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \ + mkdir qa/L0_grpc_state_cleanup/models && \ + cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \ + mkdir qa/L0_http/models && \ + cp -r docs/examples/model_repository/simple qa/L0_http/models && \ + cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_http/models && \ + cp -r docs/examples/model_repository/simple_identity qa/L0_http/models && \ + cp -r docs/examples/model_repository/simple_sequence qa/L0_http/models && \ + cp -r docs/examples/model_repository/simple_string qa/L0_http/models && \ + cp -r docs/examples/model_repository/inception_graphdef qa/L0_http/models && \ + mkdir qa/L0_https/models && \ + cp -r docs/examples/model_repository/simple qa/L0_https/models/. && \ + mkdir qa/L0_secure_grpc/models && \ + cp -r docs/examples/model_repository/simple qa/L0_secure_grpc/models/. && \ + cp bin/simple qa/L0_simple_lib/. && \ + cp bin/memory_alloc qa/L0_io/. && \ + cp bin/multi_server qa/L0_multi_server/. && \ + cp bin/memory_test qa/L0_memory/. && \ + cp bin/pinned_memory_manager_test qa/L0_memory/. && \ + cp bin/repo_agent_test qa/L0_triton_repo_agent/. && \ + cp lib/libtritonrepoagent_relocation.so qa/L0_triton_repo_agent/. && \ + mkdir qa/L0_query/models/query/1 && \ + cp tritonbuild/tritonserver/backends/query/libtriton_query.so qa/L0_query/models/query/1/. && \ + cp bin/query_test qa/L0_query/. && \ + mkdir qa/L0_iterative_sequence/models/iterative_sequence/1 && \ + cp tritonbuild/tritonserver/backends/iterative_sequence/libtriton_iterative_sequence.so qa/L0_iterative_sequence/models/iterative_sequence/1/. && \ + cp bin/register_api_test qa/L0_register/. && \ + cp bin/async_work_queue_test qa/L0_async_work_queue/. && \ + cp tritonbuild/tritonserver/backends/implicit_state/libtriton_implicit_state.so \ + qa/L0_implicit_state/. && \ + mkdir qa/L0_data_compression/models && \ + cp -r docs/examples/model_repository/simple qa/L0_data_compression/models && \ + cp bin/data_compressor_test qa/L0_data_compression/. && \ + cp bin/metrics_api_test qa/L0_metrics/. && \ + cp bin/response_cache_test qa/L0_response_cache/. && \ + cp bin/request_cancellation_test qa/L0_request_cancellation/. && \ + cp bin/triton_json_test qa/L0_json/. && \ + cp bin/backend_output_detail_test qa/L0_backend_output_detail/. && \ + cp -r deploy/mlflow-triton-plugin qa/L0_mlflow/. && \ + cp bin/input_byte_size_test qa/L0_input_validation/. && \ + cp -r docs/examples/model_repository/simple_identity qa/L0_input_validation/models + +RUN mkdir -p qa/pkgs && \ + cp python/triton*.whl qa/pkgs/. && \ + cp -rf python/test/. qa/L0_python_api/. + +RUN mkdir -p qa/L0_simple_ensemble/models/simple/1 && \ + cp docs/examples/model_repository/simple/1/model.graphdef \ + qa/L0_simple_ensemble/models/simple/1/. 
&& \ + mkdir -p qa/L0_simple_ensemble/models/simple/2 && \ + cp docs/examples/model_repository/simple/1/model.graphdef \ + qa/L0_simple_ensemble/models/simple/2/. && \ + mkdir -p qa/L0_socket/models/simple/1 && \ + cp docs/examples/model_repository/simple/1/model.graphdef \ + qa/L0_socket/models/simple/1/. + +RUN mkdir -p qa/L0_backend_identity/models && \ + cp -r src/test/models/identity_fp32 qa/L0_backend_identity/models/. && \ + mkdir -p qa/L0_backend_identity/models/identity_fp32/1 -ARG BASE_IMAGE=tensorrtserver -ARG BUILD_IMAGE=tensorrtserver_build +RUN mkdir -p qa/custom_models/custom_sequence_int32/1 && \ + cp tritonbuild/tritonserver/backends/sequence/libtriton_sequence.so \ + qa/custom_models/custom_sequence_int32/1/. && \ + mkdir -p qa/custom_models/custom_dyna_sequence_int32/1 && \ + cp tritonbuild/tritonserver/backends/dyna_sequence/libtriton_dyna_sequence.so \ + qa/custom_models/custom_dyna_sequence_int32/1/. + +# L0_lifecycle needs No-GPU build of identity backend. +RUN cd tritonbuild/identity && \ + rm -rf install build && mkdir build && cd build && \ + cmake -DTRITON_ENABLE_GPU=OFF \ + -DCMAKE_INSTALL_PREFIX:PATH=/workspace/tritonbuild/identity/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \ + -DTRITON_THIRD_PARTY_REPO_TAG:STRING=${TRITON_THIRD_PARTY_REPO_TAG} \ + -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \ + make -j16 install + +# L0_backend_python test require triton_shm_monitor +RUN cd tritonbuild/python && \ + rm -rf install build && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX:PATH=/workspace/tritonbuild/python/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \ + -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \ + make -j16 triton-shm-monitor install + +RUN cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ + qa/L0_lifecycle/. && \ + cp tritonbuild/python/install/backends/python/triton_shm_monitor*.so \ + qa/common/. && \ + mkdir -p qa/L0_perf_nomodel/custom_models/custom_zero_1_float32/1 && \ + mkdir -p qa/L0_perf_pyclients/custom_models/custom_zero_1_int32/1 && \ + mkdir -p qa/L0_infer_shm && \ + cp -r qa/L0_infer/. qa/L0_infer_shm && \ + mkdir -p qa/L0_infer_cudashm && \ + cp -r qa/L0_infer/. qa/L0_infer_cudashm && \ + mkdir -p qa/L0_infer_valgrind && \ + cp -r qa/L0_infer/. qa/L0_infer_valgrind && \ + mkdir -p qa/L0_trt_shape_tensors_shm && \ + cp -r qa/L0_trt_shape_tensors/. qa/L0_trt_shape_tensors_shm && \ + mkdir -p qa/L0_trt_shape_tensors_cudashm && \ + cp -r qa/L0_trt_shape_tensors/. qa/L0_trt_shape_tensors_cudashm && \ + mkdir -p qa/L0_batcher_shm && \ + cp -r qa/L0_batcher/. qa/L0_batcher_shm && \ + mkdir -p qa/L0_batcher_cudashm && \ + cp -r qa/L0_batcher/. qa/L0_batcher_cudashm && \ + mkdir -p qa/L0_batcher_valgrind && \ + cp -r qa/L0_batcher/. qa/L0_batcher_valgrind && \ + mkdir -p qa/L0_sequence_batcher_shm && \ + cp -r qa/L0_sequence_batcher/. qa/L0_sequence_batcher_shm && \ + mkdir -p qa/L0_sequence_batcher_cudashm && \ + cp -r qa/L0_sequence_batcher/. qa/L0_sequence_batcher_cudashm && \ + mkdir -p qa/L0_sequence_batcher_valgrind && \ + cp -r qa/L0_sequence_batcher/. qa/L0_sequence_batcher_valgrind && \ + mkdir -p qa/L0_perf_nomodel_shm && \ + cp -r qa/L0_perf_nomodel/. 
qa/L0_perf_nomodel_shm && \ + mkdir -p qa/L0_perf_nomodel_cudashm && \ + cp -r qa/L0_perf_nomodel/. qa/L0_perf_nomodel_cudashm + +# L0_model_control_stress will not be present if gitlab tests are not available +RUN if [ -d qa/L0_model_control_stress ]; then \ + mkdir -p qa/L0_model_control_stress_valgrind && \ + cp -r qa/L0_model_control_stress/. qa/L0_model_control_stress_valgrind && \ + mkdir -p qa/L0_model_control_stress_valgrind_massif && \ + cp -r qa/L0_model_control_stress/. qa/L0_model_control_stress_valgrind_massif; \ + fi + +RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ + mkdir -p qa/L0_decoupled/models/square_int32/1 && \ + mkdir -p qa/L0_decoupled/models/identity_int32/1 && \ + mkdir -p qa/L0_decoupled/models/simple_repeat/1 && \ + mkdir -p qa/L0_decoupled/models/fan_repeat/1 && \ + mkdir -p qa/L0_decoupled/models/sequence_repeat/1 && \ + mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ + mkdir -p qa/L0_decoupled/models/nested_square/1 && \ + mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 + +RUN if [ "$IGPU_BUILD" == "0" ]; then \ + cp backends/repeat/libtriton_repeat.so qa/L0_model_config && \ + cp backends/repeat/libtriton_repeat.so qa/L0_decoupled/models/repeat_int32/1 && \ + cp backends/repeat/libtriton_repeat.so qa/L0_grpc_state_cleanup/models/repeat_int32/1/. && \ + cp backends/square/libtriton_square.so qa/L0_decoupled/models/square_int32/1; \ + fi + +RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ + cp /workspace/tritonbuild/python/examples/decoupled/repeat_model.py \ + qa/L0_decoupled/python_models/repeat_int32/1/. && \ + cp /workspace/tritonbuild/python/examples/decoupled/repeat_config.pbtxt \ + qa/L0_decoupled/python_models/repeat_int32/. && \ + cp /workspace/tritonbuild/python/examples/decoupled/square_model.py \ + qa/L0_decoupled/python_models/square_int32/1/. && \ + cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ + qa/L0_decoupled/python_models/square_int32/. + +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + +RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ + cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ + qa/L0_repoagent_checksum/models/identity_int32/1/. +RUN mkdir -p qa/L0_passive_instance/models/distributed_int32_int32_int32/1 && \ + cp tritonbuild/tritonserver/backends/distributed_addsub/libtriton_distributed_addsub.so \ + qa/L0_passive_instance/models/distributed_int32_int32_int32/1/. ############################################################################ -## Build necessary artifacts needed for CI and initialize the qa/ directory. +## Copy artifacts from sdk container ############################################################################ -FROM ${BUILD_IMAGE} AS trtserver_qa +FROM ${SDK_IMAGE} AS sdk +ARG TARGETPLATFORM WORKDIR /workspace +COPY --from=cibase /workspace/qa/ qa/ RUN mkdir -p qa/clients && mkdir -p qa/pkgs && \ - cp src/clients/python/grpc_image_client.py qa/clients/. && \ - cp src/clients/python/image_client.py qa/clients/. && \ - cp src/clients/python/simple_client.py qa/clients/. && \ - cp /opt/tensorrtserver/bin/image_client qa/clients/. && \ - cp /opt/tensorrtserver/bin/perf_client qa/clients/. && \ - cp /opt/tensorrtserver/bin/simple_client qa/clients/. && \ - cp /opt/tensorrtserver/bin/caffe2plan qa/common/. 
&& \ - cp /opt/tensorrtserver/pip/tensorrtserver*.whl qa/pkgs/. && \ - mkdir qa/L0_simple_example/models && \ - cp -r docs/examples/model_repository/simple qa/L0_simple_example/models/. + cp -a install/bin/* qa/clients/. && \ + cp install/lib/libgrpcclient.so qa/clients/. && \ + cp install/lib/libhttpclient.so qa/clients/. && \ + cp install/python/*.py qa/clients/. && \ + cp install/python/triton*.whl qa/pkgs/. && \ + cp install/java/examples/*.jar qa/clients/. +RUN cp client/src/grpc_generated/go/*.go qa/L0_simple_go_client/. && \ + cp client/src/grpc_generated/javascript/*.js qa/L0_simple_nodejs_client/. && \ + cp client/src/grpc_generated/javascript/*.json qa/L0_simple_nodejs_client/. && \ + cp -r client/src/grpc_generated/java qa/L0_client_java/. ############################################################################ ## Create CI enabled image ############################################################################ FROM $BASE_IMAGE -ARG PYVER=3.5 +ARG TARGETPLATFORM + +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive + +# install platform specific packages +RUN if [ $(cat /etc/os-release | grep 'VERSION_ID="20.04"' | wc -l) -ne 0 ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libpng-dev; \ + elif [ $(cat /etc/os-release | grep 'VERSION_ID="22.04"' | wc -l) -ne 0 ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libpng-dev; \ + elif [ $(cat /etc/os-release | grep 'VERSION_ID="18.04"' | wc -l) -ne 0 ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libpng-dev; \ + else \ + echo "Ubuntu version must be either 18.04, 20.04 or 22.04" && \ + exit 1; \ + fi +# CI/QA for memcheck requires valgrind +# libarchive-dev is required by Python backend RUN apt-get update && apt-get install -y --no-install-recommends \ - jmeter \ - jmeter-http \ - libcurl3 \ + curl \ + gdb \ libopencv-dev \ + libarchive-dev \ libopencv-core-dev \ - libpng12-dev \ libzmq3-dev \ - python$PYVER \ - python$PYVER-dev \ - python$PYVER-numpy \ - python`echo $PYVER | cut -c1-1`-pil \ - python-protobuf \ - swig && \ + maven \ + openjdk-11-jdk \ + nginx \ + npm \ + protobuf-compiler \ + python3-dev \ + python3-pip \ + python3-protobuf \ + python3-setuptools \ + swig \ + valgrind && \ rm -rf /var/lib/apt/lists/* -# Use the PYVER version of python +# CI/QA expects "python" executable (not python3). RUN rm -f /usr/bin/python && \ - rm -f /usr/bin/python`echo $PYVER | cut -c1-1` && \ - ln -s /usr/bin/python$PYVER /usr/bin/python && \ - ln -s /usr/bin/python$PYVER /usr/bin/python`echo $PYVER | cut -c1-1` + ln -s /usr/bin/python3 /usr/bin/python -RUN curl -O https://bootstrap.pypa.io/get-pip.py && \ - python$PYVER get-pip.py && \ - rm get-pip.py -RUN pip install --upgrade numpy future grpcio +RUN pip3 install --upgrade wheel setuptools && \ + pip3 install --upgrade "numpy<2" pillow attrdict future grpcio requests gsutil \ + awscli six grpcio-channelz prettytable virtualenv \ + check-jsonschema -# CI expects tests in /opt/tensorrtserver/qa -WORKDIR /opt/tensorrtserver -COPY --from=trtserver_qa /workspace/qa/ qa/ +# go needed for example go client test. 
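The platform check at the top of this stage gates the apt install on the Ubuntu release by grepping /etc/os-release for VERSION_ID and failing for anything other than 18.04, 20.04 or 22.04. A minimal Python sketch of an equivalent check, offered as illustration only (the helper name and where it would live are assumptions, not part of this change):

def check_ubuntu_version(os_release_path="/etc/os-release",
                         allowed=("18.04", "20.04", "22.04")):
    # Parse the KEY=value pairs in /etc/os-release, mirroring the Dockerfile's
    # grep for VERSION_ID, and fail the same way for unsupported releases.
    fields = {}
    with open(os_release_path) as f:
        for line in f:
            line = line.strip()
            if "=" in line and not line.startswith("#"):
                key, _, value = line.partition("=")
                fields[key] = value.strip('"')
    version = fields.get("VERSION_ID", "")
    if version not in allowed:
        raise SystemExit(
            f"Ubuntu version must be either 18.04, 20.04 or 22.04 (got {version!r})"
        )
    return version

The RUN that follows installs the Go toolchain referenced by the comment above.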
+RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + wget https://golang.org/dl/go1.22.3.linux-arm64.tar.gz && \ + rm -rf /usr/local/go && tar -C /usr/local -xzf go1.22.3.linux-arm64.tar.gz && \ + rm -f go1.22.3.linux-arm64.tar.gz; \ + else \ + wget https://golang.org/dl/go1.22.3.linux-amd64.tar.gz && \ + rm -rf /usr/local/go && tar -C /usr/local -xzf go1.22.3.linux-amd64.tar.gz && \ + rm -f go1.22.3.linux-amd64.tar.gz; \ + fi +ENV GOPATH /root/go +ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin +RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \ + go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest +# CI expects tests in /opt/tritonserver/qa. The triton-server (1000) +# user should own all artifacts in case CI is run using triton-server +# user. +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 --from=sdk /workspace/qa/ qa/ # Remove CI tests that are meant to run only on build image and -# install the tensorrtserver python client APIs. -RUN rm -fr qa/L0_copyrights qa/L0_unit_test qa/L1_tfs_unit_test && \ - pip install --upgrade qa/pkgs/tensorrtserver-*.whl +# install the tritonserver/triton python client APIs. +RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \ + find qa/pkgs/ -maxdepth 1 -type f -name \ + "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ + xargs pip3 install --upgrade + +# Install Triton Python API +RUN find qa/pkgs/ -maxdepth 1 -type f -name \ + "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all] + +# Install Triton Frontend Python API +RUN find qa/pkgs/ -type f -name \ + "tritonfrontend-*.whl" | xargs -I {} pip3 install --upgrade {}[all] + +ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH} + +# DLIS-3631: Needed to run Perf Analyzer CI tests correctly +ENV LD_LIBRARY_PATH /opt/hpcx/ompi/lib:${LD_LIBRARY_PATH} -ENV PYVER ${PYVER} +# Required for PyTorch to pickup the correct HPCX libraries +ENV LD_LIBRARY_PATH /opt/hpcx/ucc/lib/:/opt/hpcx/ucx/lib/:${LD_LIBRARY_PATH} diff --git a/Dockerfile.sdk b/Dockerfile.sdk new file mode 100644 index 0000000000..5ddaf7274f --- /dev/null +++ b/Dockerfile.sdk @@ -0,0 +1,286 @@ +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# +# Multistage build. +# + +# Base image on the minimum Triton container +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.09-py3-min + +ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo +ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo +ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server +ARG TRITON_COMMON_REPO_TAG=main +ARG TRITON_CORE_REPO_TAG=main +ARG TRITON_CLIENT_REPO_TAG=main +ARG TRITON_THIRD_PARTY_REPO_TAG=main +ARG TRITON_MODEL_ANALYZER_REPO_TAG=main +ARG TRITON_ENABLE_GPU=ON +ARG JAVA_BINDINGS_MAVEN_VERSION=3.8.4 +ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG=1.5.8 + +# DCGM version to install for Model Analyzer +ARG DCGM_VERSION=3.2.6 + +ARG NVIDIA_TRITON_SERVER_SDK_VERSION=unknown +ARG NVIDIA_BUILD_ID=unknown + +############################################################################ +## Build image +############################################################################ + +FROM ${BASE_IMAGE} AS sdk_build + +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + software-properties-common \ + autoconf \ + automake \ + build-essential \ + curl \ + git \ + gperf \ + libb64-dev \ + libgoogle-perftools-dev \ + libopencv-dev \ + libopencv-core-dev \ + libssl-dev \ + libtool \ + pkg-config \ + python3 \ + python3-pip \ + python3-dev \ + rapidjson-dev \ + vim \ + wget \ + python3-pdfkit \ + openjdk-11-jdk \ + maven && \ + pip3 install --upgrade wheel setuptools && \ + pip3 install --upgrade grpcio-tools && \ + pip3 install --upgrade pip + +# Client build requires recent version of CMake (FetchContent required) +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +RUN apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . /etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* \ + && cmake --version + +# Build expects "python" executable (not python3). +RUN rm -f /usr/bin/python && \ + ln -s /usr/bin/python3 /usr/bin/python + +# Build the client library and examples +ARG TRITON_REPO_ORGANIZATION +ARG TRITON_CLIENT_REPO_SUBDIR +ARG TRITON_PA_REPO_SUBDIR +ARG TRITON_COMMON_REPO_TAG +ARG TRITON_CORE_REPO_TAG +ARG TRITON_CLIENT_REPO_TAG +ARG TRITON_THIRD_PARTY_REPO_TAG +ARG TRITON_ENABLE_GPU +ARG JAVA_BINDINGS_MAVEN_VERSION +ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG +ARG TARGETPLATFORM + +WORKDIR /workspace +COPY TRITON_VERSION . 
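The TRITON_VERSION file copied in just above is read by the client and perf_analyzer configure steps below (via `cat /workspace/TRITON_VERSION`) and forwarded to CMake as -DTRITON_VERSION. A rough Python sketch of that version plumbing, in the spirit of the cmake_*_arg helpers added later in build.py (the function name is an assumption, not part of this change):

from pathlib import Path

def triton_version_cmake_arg(version_file="/workspace/TRITON_VERSION"):
    # Read the single-line version file (e.g. "2.51.0") and format it as the
    # -DTRITON_VERSION flag that the CMake configure invocations expect.
    version = Path(version_file).read_text().strip()
    return f'"-DTRITON_VERSION={version}"'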
+COPY ${TRITON_CLIENT_REPO_SUBDIR} client +COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer + +WORKDIR /workspace/client_build +RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \ + -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ + -DTRITON_ENABLE_PERF_ANALYZER=OFF \ + -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \ + -DTRITON_ENABLE_JAVA_HTTP=ON \ + -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \ + -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client +RUN make -j16 cc-clients java-clients && \ + rm -fr ~/.m2 + +# TODO: PA will rebuild the CC clients since it depends on it. +# This should be optimized so that we do not have to build +# the CC clients twice. Similarly, because the SDK expectation is +# that PA is packaged with the python client, we hold off on building +# the python client until now. Post-migration we should focus +# effort on de-tangling these flows. +WORKDIR /workspace/pa_build +RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \ + -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \ + -DTRITON_ENABLE_CC_HTTP=ON \ + -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PYTHON_HTTP=ON \ + -DTRITON_ENABLE_PYTHON_GRPC=ON \ + -DTRITON_PACKAGE_PERF_ANALYZER=ON \ + -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \ + /workspace/perf_analyzer +RUN make -j16 perf-analyzer python-clients + +RUN pip3 install build \ + && cd /workspace/perf_analyzer/genai-perf \ + && python3 -m build --wheel --outdir /workspace/install/python + +# Install Java API Bindings +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + source /workspace/client/src/java-api-bindings/scripts/install_dependencies_and_build.sh \ + --maven-version ${JAVA_BINDINGS_MAVEN_VERSION} \ + --core-tag ${TRITON_CORE_REPO_TAG} \ + --javacpp-tag ${JAVA_BINDINGS_JAVACPP_PRESETS_TAG} \ + --jar-install-path /workspace/install/java-api-bindings; \ + fi + +############################################################################ +## Create sdk container +############################################################################ +FROM ${BASE_IMAGE} + +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive + +ARG DCGM_VERSION +ARG TRITON_REPO_ORGANIZATION +ARG TRITON_CORE_REPO_TAG +ARG TARGETPLATFORM +ARG TRITON_ENABLE_GPU + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + software-properties-common \ + curl \ + git \ + gperf \ + libb64-dev \ + libgoogle-perftools-dev \ + libopencv-dev \ + libopencv-core-dev \ + libssl-dev \ + libtool \ + python3 \ + python3-pip \ + python3-dev \ + vim \ + wget \ + python3-pdfkit \ + maven \ + default-jdk && \ + pip3 install --upgrade wheel setuptools && \ + pip3 install --upgrade grpcio-tools && \ + pip3 install --upgrade pip + +WORKDIR /workspace +COPY TRITON_VERSION . +COPY NVIDIA_Deep_Learning_Container_License.pdf . 
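Further down in this stage the locally built tritonclient wheels are installed with their [all] extra by piping `find` through `xargs printf -- '%s[all]'` into pip3. A minimal Python sketch of the same pattern, for reproducing the step outside the container build (the helper name is an assumption; the default wheel directory matches the install/python path used below):

import glob
import subprocess

def install_client_wheels(wheel_dir="install/python"):
    # Mirror the Dockerfile's find | xargs printf '%s[all]' | xargs pip3 install
    # pipeline: locate the locally built tritonclient wheels and install each
    # with its [all] extras enabled.
    wheels = glob.glob(f"{wheel_dir}/tritonclient-*linux*.whl")
    if not wheels:
        raise FileNotFoundError(f"no tritonclient wheel found under {wheel_dir}")
    subprocess.run(
        ["pip3", "install", "--upgrade"] + [f"{w}[all]" for w in wheels],
        check=True,
    )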
+COPY --from=sdk_build /workspace/client/ client/ +COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/ +COPY --from=sdk_build /workspace/install/ install/ +RUN cd install && \ + export VERSION=`cat /workspace/TRITON_VERSION` && \ + tar zcf /workspace/v$VERSION.clients.tar.gz * + +# For CI testing need to copy over L0_sdk test and L0_client_build_variants test. +RUN mkdir qa +COPY qa/L0_sdk qa/L0_sdk +COPY qa/L0_client_build_variants qa/L0_client_build_variants + +# Create a directory for all the python client tests to enable unit testing +RUN mkdir -p qa/python_client_unit_tests/ +COPY --from=sdk_build /workspace/client/src/python/library/tests/* qa/python_client_unit_tests/ + +# Install an image needed by the quickstart and other documentation. +COPY qa/images/mug.jpg images/mug.jpg + +# Install the dependencies needed to run the client examples. These +# are not needed for building but including them allows this image to +# be used to run the client examples. +RUN pip3 install --upgrade "numpy<2" pillow attrdict && \ + find install/python/ -maxdepth 1 -type f -name \ + "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ + xargs pip3 install --upgrade + +RUN pip3 install install/python/genai_perf-*.whl + +# Install DCGM +RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \ + [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \ + curl -o /tmp/cuda-keyring.deb \ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \ + && apt install /tmp/cuda-keyring.deb && rm /tmp/cuda-keyring.deb && \ + apt-get update && apt-get install -y datacenter-gpu-manager=1:${DCGM_VERSION}; \ + fi + +# Build expects "python" executable (not python3). +RUN rm -f /usr/bin/python && \ + ln -s /usr/bin/python3 /usr/bin/python + +# Install Model Analyzer +ARG TRITON_MODEL_ANALYZER_REPO_TAG +ARG TRITON_MODEL_ANALYZER_REPO="${TRITON_REPO_ORGANIZATION}/model_analyzer@${TRITON_MODEL_ANALYZER_REPO_TAG}" +RUN pip3 install "git+${TRITON_MODEL_ANALYZER_REPO}" + +# Entrypoint Banner +ENV NVIDIA_PRODUCT_NAME="Triton Server SDK" +COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/ +RUN sed 's/Server/Server SDK/' /opt/nvidia/entrypoint.d/10-banner.txt | \ + sed 's/^===/=======/' > /opt/nvidia/entrypoint.d/10-banner.new && \ + mv /opt/nvidia/entrypoint.d/10-banner.new /opt/nvidia/entrypoint.d/10-banner.txt + +ARG NVIDIA_TRITON_SERVER_SDK_VERSION +ARG NVIDIA_BUILD_ID +ENV NVIDIA_TRITON_SERVER_SDK_VERSION=${NVIDIA_TRITON_SERVER_SDK_VERSION} +ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID} + +ENV PATH /workspace/install/bin:${PATH} +ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH} + +# DLIS-3631: Needed to run Perf Analyzer CI tests correctly +ENV LD_LIBRARY_PATH /opt/hpcx/ompi/lib:${LD_LIBRARY_PATH} + +# Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc +ENV TCMALLOC_RELEASE_RATE 200 diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min new file mode 100644 index 0000000000..dec972eaf3 --- /dev/null +++ b/Dockerfile.win10.min @@ -0,0 +1,197 @@ +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Windows min container for Triton build + +ARG BASE_IMAGE=mcr.microsoft.com/windows:10.0.19042.1889 + +FROM ${BASE_IMAGE} as dependency_base + +RUN powershell.exe Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine +RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.ServicePointManager]::SecurityProtocol=[Net.SecurityProtocolType]::Tls,[Net.SecurityProtocolType]::Tls11,[Net.SecurityProtocolType]::Tls12,[Net.SecurityProtocolType]::Ssl3;Invoke-Expression( New-Object System.Net.WebClient ).DownloadString('https://chocolatey.org/install.ps1') +RUN choco install unzip -y + +# +# Installing TensorRT +# +ARG TENSORRT_VERSION=10.4.0.26 +ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip" +ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip +# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP} +ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP} +RUN unzip /tmp/%TENSORRT_ZIP% +RUN move TensorRT-* TensorRT + +LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" + + +# +# Installing cuDNN +# +ARG CUDNN_VERSION=9.4.0.58 +ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip +ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip +ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} +RUN unzip /tmp/%CUDNN_ZIP% +RUN move cudnn-* cudnn + +LABEL CUDNN_VERSION="${CUDNN_VERSION}" + + +FROM ${BASE_IMAGE} as build_base + +SHELL ["cmd", "/S", "/C"] + +RUN mkdir c:\tmp +WORKDIR /tmp + +RUN powershell.exe Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine +RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.ServicePointManager]::SecurityProtocol=[Net.SecurityProtocolType]::Tls,[Net.SecurityProtocolType]::Tls11,[Net.SecurityProtocolType]::Tls12,[Net.SecurityProtocolType]::Ssl3;Invoke-Expression( New-Object System.Net.WebClient ).DownloadString('https://chocolatey.org/install.ps1') +RUN choco install git docker unzip -y + +# +# Installing python +# +ARG PYTHON_VERSION=3.10.11 +ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe +ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe +RUN python-%PYTHON_VERSION%-amd64.exe 
/quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%" +RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe" +RUN pip install --upgrade wheel setuptools docker +RUN pip install grpcio-tools psutil + +LABEL PYTHON_VERSION=${PYTHON_VERSION} + +# +# Installing CMake +# +ARG CMAKE_VERSION=3.30.0 +RUN pip install cmake==%CMAKE_VERSION% + +ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake +ENV VCPKG_TARGET_TRIPLET x64-windows + +LABEL CMAKE_VERSION=${CMAKE_VERSION} + +# Be aware that pip can interact badly with VS cmd shell so need to pip install before +# vsdevcmd.bat (see https://bugs.python.org/issue38989) +# +# Installing Visual Studio BuildTools: VS17 2022 +# +ARG BUILDTOOLS_VERSION=17.10.35201.131 +# Download collect.exe in case of an install failure. +ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe" + +# Use the latest release channel. For more control, specify the location of an internal layout. +# Download the Build Tools bootstrapper. +# ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe +ARG BUILD_TOOLS_SOURCE=https://download.visualstudio.microsoft.com/download/pr/28626b4b-f88f-4b55-a0cf-f3eaa2c643fb/e6c43d4dfb36338d954cdb3ad9010ab2a479e712088f4f6b016eadcc721bab28/vs_BuildTools.exe +ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe +# Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended. +ARG VS_INSTALL_PATH_WP="C:\BuildTools" +RUN vs_buildtools.exe --quiet --wait --norestart --nocache install \ + --installPath %VS_INSTALL_PATH_WP% \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --includeRecommended \ + --locale "En-us" + +LABEL BUILDTOOLS_VERSION=${BUILDTOOLS_VERSION} + +WORKDIR / + +# +# Installing Vcpkg +# +ARG VCPGK_VERSION=2024.03.19 +RUN git clone --single-branch --depth=1 -b %VCPGK_VERSION% https://github.com/microsoft/vcpkg.git +WORKDIR /vcpkg +RUN bootstrap-vcpkg.bat +RUN vcpkg.exe update +RUN vcpkg.exe install \ + boost-interprocess:x64-windows \ + boost-stacktrace:x64-windows \ + b64:x64-windows \ + openssl-windows:x64-windows \ + openssl:x64-windows \ + pthread:x64-windows \ + rapidjson:x64-windows \ + zlib:x64-windows +RUN vcpkg.exe integrate install + +LABEL VCPGK_VERSION=${VCPGK_VERSION} + +WORKDIR / + +# +# Installing CUDA +# +ARG CUDA_MAJOR=12 +ARG CUDA_MINOR=5 +ARG CUDA_PATCH=1 +ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH} +ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \ + cudart_${CUDA_MAJOR}.${CUDA_MINOR} \ + nvml_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + cublas_${CUDA_MAJOR}.${CUDA_MINOR} cublas_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + cufft_${CUDA_MAJOR}.${CUDA_MINOR} cufft_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + curand_${CUDA_MAJOR}.${CUDA_MINOR} curand_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + cusolver_${CUDA_MAJOR}.${CUDA_MINOR} cusolver_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + cusparse_${CUDA_MAJOR}.${CUDA_MINOR} cusparse_dev_${CUDA_MAJOR}.${CUDA_MINOR} \ + cupti_${CUDA_MAJOR}.${CUDA_MINOR} \ + thrust_${CUDA_MAJOR}.${CUDA_MINOR} \ + visual_studio_integration_${CUDA_MAJOR}.${CUDA_MINOR}" +ARG CUDA_INSTALL_ROOT_WP="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${CUDA_MAJOR}.${CUDA_MINOR}" + +ARG CUDA_SOURCE=https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/network_installers/cuda_${CUDA_VERSION}_windows_network.exe +ADD ${CUDA_SOURCE} cuda_${CUDA_VERSION}_windows_network.exe + +RUN cuda_%CUDA_VERSION%_windows_network.exe -s %CUDA_PACKAGES% +# Copy the CUDA visualstudio integration from 
where it was installed +# into the appropriate place in BuildTools +RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensions\*" "%VS_INSTALL_PATH_WP%\MSBuild\Microsoft\VC\v170\BuildCustomizations" + +RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" + +ARG CUDNN_VERSION=9.4.0.58 +ENV CUDNN_VERSION ${CUDNN_VERSION} +COPY --from=dependency_base /cudnn /cudnn +RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." +RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." +RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\." +LABEL CUDNN_VERSION="${CUDNN_VERSION}" + +ARG TENSORRT_VERSION=10.4.0.26 +ENV TRT_VERSION ${TENSORRT_VERSION} +COPY --from=dependency_base /TensorRT /TensorRT +RUN setx PATH "c:\TensorRT\lib;%PATH%" +LABEL TENSORRT_VERSION="${TENSORRT_VERSION}" + +LABEL CUDA_VERSION="${CUDA_VERSION}" +# It is important that the entrypoint initialize VisualStudio +# environment otherwise the build will fail. Also set +# CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so +# that cmake can find the packages installed by vcpkg. +ENTRYPOINT C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat && diff --git a/LICENSE b/LICENSE index 8d2301c1f9..5529809efc 100644 --- a/LICENSE +++ b/LICENSE @@ -1,25 +1,25 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of NVIDIA CORPORATION nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/NVIDIA_Deep_Learning_Container_License.pdf b/NVIDIA_Deep_Learning_Container_License.pdf new file mode 100644 index 0000000000..bfdce390f3 Binary files /dev/null and b/NVIDIA_Deep_Learning_Container_License.pdf differ diff --git a/README.md b/README.md new file mode 100644 index 0000000000..09d7feaca6 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +<!-- +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +# Triton Inference Server + +[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) + +>[!WARNING] +> You are currently on the `r24.10` branch which tracks under-development progress towards the next release. <br> \ No newline at end of file diff --git a/README.rst b/README.rst deleted file mode 100644 index b8a516266d..0000000000 --- a/README.rst +++ /dev/null @@ -1,113 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
- # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -|License| - -NVIDIA TensorRT Inference Server -================================ - - - **NOTE: You are currently on the master branch which tracks - under-development progress towards the next release. The latest - release of the TensorRT Inference Server is 0.8.0 beta and is - available on branch** `r18.11 - <https://github.com/NVIDIA/tensorrt-inference-server/tree/r18.11>`_. - -.. overview-begin-marker-do-not-remove - -The NVIDIA TensorRT Inference Server (TRTIS) provides a cloud -inferencing solution optimized for NVIDIA GPUs. The server provides an -inference service via an HTTP or gRPC endpoint, allowing remote -clients to request inferencing for any model being managed by the -server. TRTIS provides the following features: - -* `Multiple framework support <https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/model_repository.html#model-definition>`_. The server can manage any number and mix of - models (limited by system disk and memory resources). Supports - TensorRT, TensorFlow GraphDef, TensorFlow SavedModel and Caffe2 - NetDef model formats. Also supports TensorFlow-TensorRT integrated - models. -* Multi-GPU support. The server can distribute inferencing across all - system GPUs. -* `Concurrent model execution support <https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/model_configuration.html?highlight=batching#instance-groups>`_. Multiple models (or multiple instances of the - same model) can run simultaneously on the same GPU. -* Batching support. For models that support batching, the server can - accept requests for a batch of inputs and respond with the - corresponding batch of outputs. The server also supports `dynamic - batching <https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/model_configuration.html?highlight=batching#dynamic-batching>`_ where individual inference requests are dynamically - combined together to improve inference throughput. 
Dynamic batching - is transparent to the client requesting inference. -* `Model repositories <https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/model_repository.html#>`_ may reside on a locally accessible file system (e.g. NFS) or - in Google Cloud Storage. -* Readiness and liveness `health endpoints <https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/http_grpc_api.html#health>`_ suitable for any orchestration or deployment framework, such as Kubernetes. -* `Metrics <https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/metrics.html>`_ indicating GPU utiliization, server throughput, and server - latency. - -.. overview-end-marker-do-not-remove - -The current release of the TensorRT Inference Server is 0.8.0 beta and -corresponds to the 18.11 release of the tensorrtserver container on -`NVIDIA GPU Cloud (NGC) <https://ngc.nvidia.com>`_. The branch for -this release is `r18.11 -<https://github.com/NVIDIA/tensorrt-inference-server/tree/r18.11>`_. The -User Guide, Developer Guide, and API Reference `documentation -<https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/index.html>`_ -provide guidance on installing, building and running TRTIS. - -You can also view the documentation for the `master branch -<https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-master-branch-guide/docs/index.html>`_ -and for `earlier releases -<https://docs.nvidia.com/deeplearning/sdk/inference-server-archived/index.html>`_. - -Contributing ------------- - -Contributions to TensorRT Inference Server are more than welcome. To -contribute make a pull request and follow the guidelines outlined in -the `Contributing <CONTRIBUTING.md>`_ document. - -Reporting problems, asking questions ------------------------------------- - -We appreciate any feedback, questions or bug reporting regarding this -project. When help with code is needed, follow the process outlined in -the Stack Overflow (https://stackoverflow.com/help/mcve) -document. Ensure posted examples are: - -* minimal – use as little code as possible that still produces the - same problem - -* complete – provide all parts needed to reproduce the problem. Check - if you can strip external dependency and still show the problem. The - less time we spend on reproducing problems the more time we have to - fix it - -* verifiable – test the code you're about to provide to make sure it - reproduces the problem. Remove all other problems that are not - related to your request/question. - -.. |License| image:: https://img.shields.io/badge/License-BSD3-lightgrey.svg - :target: https://opensource.org/licenses/BSD-3-Clause diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..7aa39f4e5d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,44 @@ +<!-- +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +# Report a Security Vulnerability + +To report a potential security vulnerability in any NVIDIA product, please use either: +* This web form: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html), or +* Send email to: [NVIDIA PSIRT](mailto:psirt@nvidia.com) + +**OEM Partners should contact their NVIDIA Customer Program Manager** + +If reporting a potential vulnerability via email, please encrypt it using NVIDIA’s public PGP key ([see PGP Key page](https://www.nvidia.com/en-us/security/pgp-key/)) and include the following information: +1. Product/Driver name and version/branch that contains the vulnerability +2. Type of vulnerability (code execution, denial of service, buffer overflow, etc.) +3. Instructions to reproduce the vulnerability +4. Proof-of-concept or exploit code +5. Potential impact of the vulnerability, including how an attacker could exploit the vulnerability + +See https://www.nvidia.com/en-us/security/ for past NVIDIA Security Bulletins and Notices. diff --git a/TRITON_VERSION b/TRITON_VERSION new file mode 100644 index 0000000000..40af962ea2 --- /dev/null +++ b/TRITON_VERSION @@ -0,0 +1 @@ +2.51.0 \ No newline at end of file diff --git a/Triton-CCLA-v1.pdf b/Triton-CCLA-v1.pdf new file mode 100644 index 0000000000..d08afc8183 Binary files /dev/null and b/Triton-CCLA-v1.pdf differ diff --git a/VERSION b/VERSION deleted file mode 100644 index 7382a313f5..0000000000 --- a/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.10.0dev diff --git a/WORKSPACE b/WORKSPACE deleted file mode 100644 index afb9d3217c..0000000000 --- a/WORKSPACE +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -workspace(name = "inference_server") - -local_repository( - name = "org_tensorflow", - path = "/opt/tensorflow/", -) - -local_repository( - name = "tf_serving", - path = __workspace_dir__ + "/serving/", -) - -new_local_repository( - name = "extern_lib", - path = "/opt/tensorrtserver/lib", - build_file_content = """ -cc_library( - name = "libcaffe2", - srcs = ["libcaffe2.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libcaffe2_gpu", - srcs = ["libcaffe2_gpu.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libcaffe2_detectron_ops_gpu", - srcs = ["libcaffe2_detectron_ops_gpu.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libc10", - srcs = ["libc10.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libmkl_core", - srcs = ["libmkl_core.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libmkl_gnu_thread", - srcs = ["libmkl_gnu_thread.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libmkl_avx2", - srcs = ["libmkl_avx2.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libmkl_def", - srcs = ["libmkl_def.so"], - visibility = ["//visibility:public"], -) -cc_library( - name = "libmkl_intel_lp64", - srcs = ["libmkl_intel_lp64.so"], - visibility = ["//visibility:public"], -) -""", -) - -# Need prometheus for metrics -http_archive( - name = "prometheus", - strip_prefix = "prometheus-cpp-0.5.0", - urls = ["https://github.com/jupp0r/prometheus-cpp/archive/v0.5.0.tar.gz"], -) -load("@prometheus//:repositories.bzl", "load_civetweb") -load_civetweb() - -# TensorFlow depends on "io_bazel_rules_closure" so we need this here. -# Needs to be kept in sync with the same target in TensorFlow's WORKSPACE file. -http_archive( - name = "io_bazel_rules_closure", - sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae", - strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz", - "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz", # 2018-04-13 - ], -) - -load('@tf_serving//tensorflow_serving:workspace.bzl', 'tf_serving_workspace') -tf_serving_workspace() - -# Specify the minimum required bazel version. -load("@org_tensorflow//tensorflow:version_check.bzl", "check_bazel_version_at_least") - -check_bazel_version_at_least("0.15.0") diff --git a/build.py b/build.py new file mode 100755 index 0000000000..9449d34826 --- /dev/null +++ b/build.py @@ -0,0 +1,2962 @@ +#!/usr/bin/env python3 +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import importlib.util +import multiprocessing +import os +import os.path +import pathlib +import platform +import stat +import subprocess +import sys +from inspect import getsourcefile + +import distro +import requests + +# +# Build Triton Inference Server. +# + +# By default build.py builds the Triton Docker image, but can also be +# used to build without Docker. See docs/build.md and --help for more +# information. +# +# The TRITON_VERSION file indicates the Triton version and +# TRITON_VERSION_MAP is used to determine the corresponding container +# version and upstream container version (upstream containers are +# dependencies required by Triton). These versions may be overridden. + +# Map from Triton version to corresponding container and component versions. +# +# triton version -> +# (triton container version, +# upstream container version, +# ORT version, +# ORT OpenVINO version (use None to disable OpenVINO in ORT), +# Standalone OpenVINO version, +# DCGM version +# ) +# +# Currently the OpenVINO versions used in ORT and standalone must +# match because of the way dlopen works with loading the backends. If +# different versions are used then one backend or the other will +# incorrectly load the other version of the openvino libraries. 
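The comment above documents the per-version tuple layout in TRITON_VERSION_MAP. As a hedged illustration of how such an entry could be unpacked into named fields (the helper below is not part of build.py; note that current entries also append a vLLM version beyond the documented fields):

def describe_release(version, version_map):
    # Unpack one TRITON_VERSION_MAP entry following the field order documented
    # above. Any trailing fields (e.g. a vLLM version) are captured loosely.
    container, upstream, ort, ort_openvino, openvino, dcgm, *extra = version_map[version]
    return {
        "triton": version,
        "container": container,
        "upstream_container": upstream,
        "onnxruntime": ort,
        "ort_openvino": ort_openvino,
        "standalone_openvino": openvino,
        "dcgm": dcgm,
        "extra": list(extra),
    }

For example, describe_release("2.51.0", TRITON_VERSION_MAP)["container"] would evaluate to "24.10" for the map defined below.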
+# +TRITON_VERSION_MAP = { + "2.51.0": ( + "24.10", # triton container + "24.10", # upstream container + "1.19.2", # ORT + "2024.0.0", # ORT OpenVINO + "2024.0.0", # Standalone OpenVINO + "3.2.6", # DCGM version + "0.5.3.post1", # vLLM version + ) +} + +CORE_BACKENDS = ["ensemble"] + +FLAGS = None +EXTRA_CORE_CMAKE_FLAGS = {} +OVERRIDE_CORE_CMAKE_FLAGS = {} +EXTRA_BACKEND_CMAKE_FLAGS = {} +OVERRIDE_BACKEND_CMAKE_FLAGS = {} + +THIS_SCRIPT_DIR = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0))) + + +def log(msg, force=False): + if force or not FLAGS.quiet: + try: + print(msg, file=sys.stderr) + except Exception: + print("<failed to log>", file=sys.stderr) + + +def log_verbose(msg): + if FLAGS.verbose: + log(msg, force=True) + + +def fail(msg): + fail_if(True, msg) + + +def fail_if(p, msg): + if p: + print("error: {}".format(msg), file=sys.stderr) + sys.exit(1) + + +def target_platform(): + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_platform is not None: + return FLAGS.target_platform + platform_string = platform.system().lower() + if platform_string == "linux": + # Need to inspect the /etc/os-release file to get + # the distribution of linux + id_like_list = distro.like().split() + if "debian" in id_like_list: + return "linux" + else: + return "rhel" + else: + return platform_string + + +def target_machine(): + # When called by compose.py, FLAGS will be None + if FLAGS and FLAGS.target_machine is not None: + return FLAGS.target_machine + return platform.machine().lower() + + +def container_versions(version, container_version, upstream_container_version): + if container_version is None: + if version not in TRITON_VERSION_MAP: + fail("container version not known for {}".format(version)) + container_version = TRITON_VERSION_MAP[version][0] + if upstream_container_version is None: + if version not in TRITON_VERSION_MAP: + fail("upstream container version not known for {}".format(version)) + upstream_container_version = TRITON_VERSION_MAP[version][1] + return container_version, upstream_container_version + + +class BuildScript: + """Utility class for writing build scripts""" + + def __init__(self, filepath, desc=None, verbose=False): + self._filepath = filepath + self._file = open(self._filepath, "w") + self._verbose = verbose + self.header(desc) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def __del__(self): + self.close() + + def close(self): + if self._file is not None: + if target_platform() == "windows": + self.blankln() + self._file.write("}\n") + self._file.write("catch {\n") + self._file.write(" $_;\n") + self._file.write(" ExitWithCode 1;\n") + self._file.write("}\n") + """Close the file""" + self._file.close() + self._file = None + st = os.stat(self._filepath) + os.chmod(self._filepath, st.st_mode | stat.S_IEXEC) + + def blankln(self): + self._file.write("\n") + + def commentln(self, cnt): + self._file.write("#" * cnt + "\n") + + def comment(self, msg=""): + if not isinstance(msg, str): + try: + for m in msg: + self._file.write(f"# {msg}\n") + return + except TypeError: + pass + self._file.write(f"# {msg}\n") + + def comment_verbose(self, msg=""): + if self._verbose: + self.comment(msg) + + def header(self, desc=None): + if target_platform() != "windows": + self._file.write("#!/usr/bin/env bash\n\n") + + if desc is not None: + self.comment() + self.comment(desc) + self.comment() + self.blankln() + + self.comment("Exit script immediately if any command fails") + if target_platform() == 
"windows": + self._file.write("$UseStructuredOutput = $false\n") + self.blankln() + self._file.write("function ExitWithCode($exitcode) {\n") + self._file.write(" $host.SetShouldExit($exitcode)\n") + self._file.write(" exit $exitcode\n") + self._file.write("}\n") + self.blankln() + if self._verbose: + self._file.write("Set-PSDebug -Trace 1\n") + self.blankln() + self._file.write("try {\n") + else: + self._file.write("set -e\n") + if self._verbose: + self._file.write("set -x\n") + self.blankln() + + def envvar_ref(self, v): + if target_platform() == "windows": + return f"${{env:{v}}}" + return f"${{{v}}}" + + def cmd(self, clist, check_exitcode=False): + if isinstance(clist, str): + self._file.write(f"{clist}\n") + else: + for c in clist: + self._file.write(f"{c} ") + self.blankln() + + if check_exitcode: + if target_platform() == "windows": + self._file.write("if ($LASTEXITCODE -ne 0) {\n") + self._file.write( + ' Write-Output "exited with status code $LASTEXITCODE";\n' + ) + self._file.write(" ExitWithCode 1;\n") + self._file.write("}\n") + + def cwd(self, path): + if target_platform() == "windows": + self.cmd(f"Set-Location -EV Err -EA Stop {path}") + else: + self.cmd(f"cd {path}") + + def cp(self, src, dest): + if target_platform() == "windows": + self.cmd(f"Copy-Item -EV Err -EA Stop {src} -Destination {dest}") + else: + self.cmd(f"cp {src} {dest}") + + def mkdir(self, path): + if target_platform() == "windows": + self.cmd( + f"New-Item -EV Err -EA Stop -ItemType Directory -Force -Path {path}" + ) + else: + self.cmd(f"mkdir -p {pathlib.Path(path)}") + + def rmdir(self, path): + if target_platform() == "windows": + self.cmd(f"if (Test-Path -Path {path}) {{") + self.cmd(f" Remove-Item -EV Err -EA Stop -Recurse -Force {path}") + self.cmd("}") + else: + self.cmd(f"rm -fr {pathlib.Path(path)}") + + def cpdir(self, src, dest): + if target_platform() == "windows": + self.cmd(f"Copy-Item -EV Err -EA Stop -Recurse {src} -Destination {dest}") + else: + self.cmd(f"cp -r {src} {dest}") + + def tar(self, subdir, tar_filename): + if target_platform() == "windows": + fail("unsupported operation: tar") + else: + self.cmd(f"tar zcf {tar_filename} {subdir}") + + def cmake(self, args): + # Pass some additional envvars into cmake... + env_args = [] + for k in ("TRT_VERSION", "CMAKE_TOOLCHAIN_FILE", "VCPKG_TARGET_TRIPLET"): + env_args += [f'"-D{k}={self.envvar_ref(k)}"'] + self.cmd(f'cmake {" ".join(env_args)} {" ".join(args)}', check_exitcode=True) + + def makeinstall(self, target="install"): + verbose_flag = "-v" if self._verbose else "" + self.cmd( + f"cmake --build . --config {FLAGS.build_type} -j{FLAGS.build_parallel} {verbose_flag} -t {target}" + ) + + def gitclone(self, repo, tag, subdir, org): + clone_dir = subdir + if not FLAGS.no_force_clone: + self.rmdir(clone_dir) + + if target_platform() == "windows": + self.cmd(f"if (-Not (Test-Path -Path {clone_dir})) {{") + else: + self.cmd(f"if [[ ! -e {clone_dir} ]]; then") + + # FIXME [DLIS-4045 - Currently the tag starting with "pull/" is not + # working with "--repo-tag" as the option is not forwarded to the + # individual repo build correctly.] + # If 'tag' starts with "pull/" then it must be of form + # "pull/<pr>/head". We just clone at "main" and then fetch the + # reference onto a new branch we name "tritonbuildref". 
+ if tag.startswith("pull/"): + self.cmd( + f" git clone --recursive --depth=1 {org}/{repo}.git {subdir};", + check_exitcode=True, + ) + self.cmd("}" if target_platform() == "windows" else "fi") + self.cwd(subdir) + self.cmd(f"git fetch origin {tag}:tritonbuildref", check_exitcode=True) + self.cmd(f"git checkout tritonbuildref", check_exitcode=True) + else: + self.cmd( + f" git clone --recursive --single-branch --depth=1 -b {tag} {org}/{repo}.git {subdir};", + check_exitcode=True, + ) + self.cmd("}" if target_platform() == "windows" else "fi") + + +def cmake_core_arg(name, type, value): + # Return cmake -D setting to set name=value for core build. Use + # command-line specified value if one is given. + if name in OVERRIDE_CORE_CMAKE_FLAGS: + value = OVERRIDE_CORE_CMAKE_FLAGS[name] + if type is None: + type = "" + else: + type = ":{}".format(type) + return '"-D{}{}={}"'.format(name, type, value) + + +def cmake_core_enable(name, flag): + # Return cmake -D setting to set name=flag?ON:OFF for core + # build. Use command-line specified value for 'flag' if one is + # given. + if name in OVERRIDE_CORE_CMAKE_FLAGS: + value = OVERRIDE_CORE_CMAKE_FLAGS[name] + else: + value = "ON" if flag else "OFF" + return '"-D{}:BOOL={}"'.format(name, value) + + +def cmake_core_extra_args(): + args = [] + for k, v in EXTRA_CORE_CMAKE_FLAGS.items(): + args.append('"-D{}={}"'.format(k, v)) + return args + + +def cmake_backend_arg(backend, name, type, value): + # Return cmake -D setting to set name=value for backend build. Use + # command-line specified value if one is given. + if backend in OVERRIDE_BACKEND_CMAKE_FLAGS: + if name in OVERRIDE_BACKEND_CMAKE_FLAGS[backend]: + value = OVERRIDE_BACKEND_CMAKE_FLAGS[backend][name] + if type is None: + type = "" + else: + type = ":{}".format(type) + return '"-D{}{}={}"'.format(name, type, value) + + +def cmake_backend_enable(backend, name, flag): + # Return cmake -D setting to set name=flag?ON:OFF for backend + # build. Use command-line specified value for 'flag' if one is + # given. 
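+    # For example, cmake_backend_enable("onnxruntime", "TRITON_ENABLE_GPU", True)
+    # returns the string '"-DTRITON_ENABLE_GPU:BOOL=ON"' (quotes included),
+    # unless the flag was overridden via OVERRIDE_BACKEND_CMAKE_FLAGS.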
+ value = None + if backend in OVERRIDE_BACKEND_CMAKE_FLAGS: + if name in OVERRIDE_BACKEND_CMAKE_FLAGS[backend]: + value = OVERRIDE_BACKEND_CMAKE_FLAGS[backend][name] + if value is None: + value = "ON" if flag else "OFF" + return '"-D{}:BOOL={}"'.format(name, value) + + +def cmake_backend_extra_args(backend): + args = [] + if backend in EXTRA_BACKEND_CMAKE_FLAGS: + for k, v in EXTRA_BACKEND_CMAKE_FLAGS[backend].items(): + args.append('"-D{}={}"'.format(k, v)) + return args + + +def cmake_repoagent_arg(name, type, value): + # For now there is no override for repo-agents + if type is None: + type = "" + else: + type = ":{}".format(type) + return '"-D{}{}={}"'.format(name, type, value) + + +def cmake_repoagent_enable(name, flag): + # For now there is no override for repo-agents + value = "ON" if flag else "OFF" + return '"-D{}:BOOL={}"'.format(name, value) + + +def cmake_repoagent_extra_args(): + # For now there is no extra args for repo-agents + args = [] + return args + + +def cmake_cache_arg(name, type, value): + # For now there is no override for caches + if type is None: + type = "" + else: + type = ":{}".format(type) + return '"-D{}{}={}"'.format(name, type, value) + + +def cmake_cache_enable(name, flag): + # For now there is no override for caches + value = "ON" if flag else "OFF" + return '"-D{}:BOOL={}"'.format(name, value) + + +def cmake_cache_extra_args(): + # For now there is no extra args for caches + args = [] + return args + + +def core_cmake_args(components, backends, cmake_dir, install_dir): + cargs = [ + cmake_core_arg("CMAKE_BUILD_TYPE", None, FLAGS.build_type), + cmake_core_arg("CMAKE_INSTALL_PREFIX", "PATH", install_dir), + cmake_core_arg("TRITON_VERSION", "STRING", FLAGS.version), + cmake_core_arg("TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization), + cmake_core_arg("TRITON_COMMON_REPO_TAG", "STRING", components["common"]), + cmake_core_arg("TRITON_CORE_REPO_TAG", "STRING", components["core"]), + cmake_core_arg("TRITON_BACKEND_REPO_TAG", "STRING", components["backend"]), + cmake_core_arg( + "TRITON_THIRD_PARTY_REPO_TAG", "STRING", components["thirdparty"] + ), + ] + + cargs.append(cmake_core_enable("TRITON_ENABLE_LOGGING", FLAGS.enable_logging)) + cargs.append(cmake_core_enable("TRITON_ENABLE_STATS", FLAGS.enable_stats)) + cargs.append(cmake_core_enable("TRITON_ENABLE_METRICS", FLAGS.enable_metrics)) + cargs.append( + cmake_core_enable("TRITON_ENABLE_METRICS_GPU", FLAGS.enable_gpu_metrics) + ) + cargs.append( + cmake_core_enable("TRITON_ENABLE_METRICS_CPU", FLAGS.enable_cpu_metrics) + ) + cargs.append(cmake_core_enable("TRITON_ENABLE_TRACING", FLAGS.enable_tracing)) + cargs.append(cmake_core_enable("TRITON_ENABLE_NVTX", FLAGS.enable_nvtx)) + + cargs.append(cmake_core_enable("TRITON_ENABLE_GPU", FLAGS.enable_gpu)) + cargs.append( + cmake_core_arg( + "TRITON_MIN_COMPUTE_CAPABILITY", None, FLAGS.min_compute_capability + ) + ) + + cargs.append(cmake_core_enable("TRITON_ENABLE_MALI_GPU", FLAGS.enable_mali_gpu)) + + cargs.append(cmake_core_enable("TRITON_ENABLE_GRPC", "grpc" in FLAGS.endpoint)) + cargs.append(cmake_core_enable("TRITON_ENABLE_HTTP", "http" in FLAGS.endpoint)) + cargs.append( + cmake_core_enable("TRITON_ENABLE_SAGEMAKER", "sagemaker" in FLAGS.endpoint) + ) + cargs.append( + cmake_core_enable("TRITON_ENABLE_VERTEX_AI", "vertex-ai" in FLAGS.endpoint) + ) + + cargs.append(cmake_core_enable("TRITON_ENABLE_GCS", "gcs" in FLAGS.filesystem)) + cargs.append(cmake_core_enable("TRITON_ENABLE_S3", "s3" in FLAGS.filesystem)) + cargs.append( + cmake_core_enable( 
+ "TRITON_ENABLE_AZURE_STORAGE", "azure_storage" in FLAGS.filesystem + ) + ) + + cargs.append(cmake_core_enable("TRITON_ENABLE_ENSEMBLE", "ensemble" in backends)) + cargs.append(cmake_core_enable("TRITON_ENABLE_TENSORRT", "tensorrt" in backends)) + + cargs += cmake_core_extra_args() + cargs.append(cmake_dir) + return cargs + + +def repoagent_repo(ra): + return "{}_repository_agent".format(ra) + + +def repoagent_cmake_args(images, components, ra, install_dir): + args = [] + + cargs = args + [ + cmake_repoagent_arg("CMAKE_BUILD_TYPE", None, FLAGS.build_type), + cmake_repoagent_arg("CMAKE_INSTALL_PREFIX", "PATH", install_dir), + cmake_repoagent_arg( + "TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization + ), + cmake_repoagent_arg("TRITON_COMMON_REPO_TAG", "STRING", components["common"]), + cmake_repoagent_arg("TRITON_CORE_REPO_TAG", "STRING", components["core"]), + ] + + cargs.append(cmake_repoagent_enable("TRITON_ENABLE_GPU", FLAGS.enable_gpu)) + cargs += cmake_repoagent_extra_args() + cargs.append("..") + return cargs + + +def cache_repo(cache): + # example: "local", or "redis" + return "{}_cache".format(cache) + + +def cache_cmake_args(images, components, cache, install_dir): + args = [] + + cargs = args + [ + cmake_cache_arg("CMAKE_BUILD_TYPE", None, FLAGS.build_type), + cmake_cache_arg("CMAKE_INSTALL_PREFIX", "PATH", install_dir), + cmake_cache_arg( + "TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization + ), + cmake_cache_arg("TRITON_COMMON_REPO_TAG", "STRING", components["common"]), + cmake_cache_arg("TRITON_CORE_REPO_TAG", "STRING", components["core"]), + ] + + cargs.append(cmake_cache_enable("TRITON_ENABLE_GPU", FLAGS.enable_gpu)) + cargs += cmake_cache_extra_args() + cargs.append("..") + return cargs + + +def backend_repo(be): + return "{}_backend".format(be) + + +def backend_cmake_args(images, components, be, install_dir, library_paths): + cmake_build_type = FLAGS.build_type + + if be == "onnxruntime": + args = onnxruntime_cmake_args(images, library_paths) + elif be == "openvino": + args = openvino_cmake_args() + elif be == "tensorflow": + args = tensorflow_cmake_args(images, library_paths) + elif be == "python": + args = [] + elif be == "dali": + args = dali_cmake_args() + elif be == "pytorch": + args = pytorch_cmake_args(images) + elif be == "armnn_tflite": + args = armnn_tflite_cmake_args() + elif be == "fil": + args = fil_cmake_args(images) + # DLIS-4618: FIL backend fails debug build, so override it for now. 
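+        # Note: the Release override below applies only to the FIL backend;
+        # every other backend is built with FLAGS.build_type.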
+ cmake_build_type = "Release" + elif be == "fastertransformer": + args = fastertransformer_cmake_args() + elif be == "tensorrt": + args = tensorrt_cmake_args() + elif be == "tensorrtllm": + args = tensorrtllm_cmake_args(images) + else: + args = [] + + cargs = args + [ + cmake_backend_arg(be, "CMAKE_BUILD_TYPE", None, cmake_build_type), + cmake_backend_arg(be, "CMAKE_INSTALL_PREFIX", "PATH", install_dir), + cmake_backend_arg( + be, "TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization + ), + cmake_backend_arg(be, "TRITON_COMMON_REPO_TAG", "STRING", components["common"]), + cmake_backend_arg(be, "TRITON_CORE_REPO_TAG", "STRING", components["core"]), + cmake_backend_arg( + be, "TRITON_BACKEND_REPO_TAG", "STRING", components["backend"] + ), + ] + + cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_GPU", FLAGS.enable_gpu)) + cargs.append( + cmake_backend_enable(be, "TRITON_ENABLE_MALI_GPU", FLAGS.enable_mali_gpu) + ) + cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_STATS", FLAGS.enable_stats)) + cargs.append( + cmake_backend_enable(be, "TRITON_ENABLE_METRICS", FLAGS.enable_metrics) + ) + + # [DLIS-4950] always enable below once Windows image is updated with CUPTI + # cargs.append(cmake_backend_enable(be, 'TRITON_ENABLE_MEMORY_TRACKER', True)) + if (target_platform() == "windows") and (not FLAGS.no_container_build): + print( + "Warning: Detected docker build is used for Windows, backend utility 'device memory tracker' will be disabled due to missing library in CUDA Windows docker image." + ) + cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_MEMORY_TRACKER", False)) + elif target_platform() == "igpu": + print( + "Warning: Detected iGPU build, backend utility 'device memory tracker' will be disabled as iGPU doesn't contain required version of the library." 
+ ) + cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_MEMORY_TRACKER", False)) + elif FLAGS.enable_gpu: + cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_MEMORY_TRACKER", True)) + + cargs += cmake_backend_extra_args(be) + if be == "tensorrtllm": + cargs.append("-S ../inflight_batcher_llm -B .") + + else: + cargs.append("..") + return cargs + + +def pytorch_cmake_args(images): + if "pytorch" in images: + image = images["pytorch"] + else: + image = "nvcr.io/nvidia/pytorch:{}-py3".format(FLAGS.upstream_container_version) + cargs = [ + cmake_backend_arg("pytorch", "TRITON_PYTORCH_DOCKER_IMAGE", None, image), + ] + + # TODO: TPRD-372 TorchTRT extension is not currently supported by our manylinux build + # TODO: TPRD-373 NVTX extension is not currently supported by our manylinux build + if target_platform() != "rhel": + if FLAGS.enable_gpu: + cargs.append( + cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True) + ) + cargs.append( + cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx) + ) + return cargs + + +def onnxruntime_cmake_args(images, library_paths): + cargs = [ + cmake_backend_arg( + "onnxruntime", + "TRITON_BUILD_ONNXRUNTIME_VERSION", + None, + os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + if os.getenv("TRITON_BUILD_ONNXRUNTIME_VERSION") + else TRITON_VERSION_MAP[FLAGS.version][2], + ) + ] + + # TRITON_ENABLE_GPU is already set for all backends in backend_cmake_args() + # TODO: TPRD-334 TensorRT extension is not currently supported by our manylinux build + if FLAGS.enable_gpu and target_platform() != "rhel": + cargs.append( + cmake_backend_enable( + "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_TENSORRT", True + ) + ) + + if target_platform() == "windows": + if "base" in images: + cargs.append( + cmake_backend_arg( + "onnxruntime", "TRITON_BUILD_CONTAINER", None, images["base"] + ) + ) + else: + if "base" in images: + cargs.append( + cmake_backend_arg( + "onnxruntime", "TRITON_BUILD_CONTAINER", None, images["base"] + ) + ) + else: + cargs.append( + cmake_backend_arg( + "onnxruntime", + "TRITON_BUILD_CONTAINER_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][1], + ) + ) + + # TODO: TPRD-333 OpenVino extension is not currently supported by our manylinux build + if ( + (target_machine() != "aarch64") + and (target_platform() != "rhel") + and (TRITON_VERSION_MAP[FLAGS.version][3] is not None) + ): + cargs.append( + cmake_backend_enable( + "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_OPENVINO", True + ) + ) + cargs.append( + cmake_backend_arg( + "onnxruntime", + "TRITON_BUILD_ONNXRUNTIME_OPENVINO_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][3], + ) + ) + + if (target_platform() == "igpu") or (target_platform() == "rhel"): + cargs.append( + cmake_backend_arg( + "onnxruntime", + "TRITON_BUILD_TARGET_PLATFORM", + None, + target_platform(), + ) + ) + + return cargs + + +def openvino_cmake_args(): + cargs = [ + cmake_backend_arg( + "openvino", + "TRITON_BUILD_OPENVINO_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][4], + ) + ] + if target_platform() == "windows": + if "base" in images: + cargs.append( + cmake_backend_arg( + "openvino", "TRITON_BUILD_CONTAINER", None, images["base"] + ) + ) + else: + if "base" in images: + cargs.append( + cmake_backend_arg( + "openvino", "TRITON_BUILD_CONTAINER", None, images["base"] + ) + ) + else: + cargs.append( + cmake_backend_arg( + "openvino", + "TRITON_BUILD_CONTAINER_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][1], + ) + ) + return cargs + + +def tensorrt_cmake_args(): + cargs = [ + 
cmake_backend_enable("tensorrt", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx), + ] + if target_platform() == "windows": + cargs.append( + cmake_backend_arg( + "tensorrt", "TRITON_TENSORRT_INCLUDE_PATHS", None, "c:/TensorRT/include" + ) + ) + + return cargs + + +def tensorflow_cmake_args(images, library_paths): + backend_name = "tensorflow" + extra_args = [] + + # If a specific TF image is specified use it, otherwise pull from NGC. + if backend_name in images: + image = images[backend_name] + else: + image = "nvcr.io/nvidia/tensorflow:{}-tf2-py3".format( + FLAGS.upstream_container_version + ) + extra_args = [ + cmake_backend_arg(backend_name, "TRITON_TENSORFLOW_DOCKER_IMAGE", None, image) + ] + return extra_args + + +def dali_cmake_args(): + return [ + cmake_backend_enable("dali", "TRITON_DALI_SKIP_DOWNLOAD", False), + ] + + +def fil_cmake_args(images): + cargs = [cmake_backend_enable("fil", "TRITON_FIL_DOCKER_BUILD", True)] + if "base" in images: + cargs.append( + cmake_backend_arg("fil", "TRITON_BUILD_CONTAINER", None, images["base"]) + ) + else: + cargs.append( + cmake_backend_arg( + "fil", + "TRITON_BUILD_CONTAINER_VERSION", + None, + TRITON_VERSION_MAP[FLAGS.version][1], + ) + ) + + return cargs + + +def armnn_tflite_cmake_args(): + return [ + cmake_backend_arg("armnn_tflite", "JOBS", None, multiprocessing.cpu_count()), + ] + + +def fastertransformer_cmake_args(): + print("Warning: FasterTransformer backend is not officially supported.") + cargs = [ + cmake_backend_arg( + "fastertransformer", "CMAKE_EXPORT_COMPILE_COMMANDS", None, 1 + ), + cmake_backend_arg("fastertransformer", "ENABLE_FP8", None, "OFF"), + ] + return cargs + + +def tensorrtllm_cmake_args(images): + cargs = [] + cargs.append(cmake_backend_enable("tensorrtllm", "USE_CXX11_ABI", True)) + return cargs + + +def install_dcgm_libraries(dcgm_version, target_machine): + if dcgm_version == "": + fail( + "unable to determine default repo-tag, DCGM version not known for {}".format( + FLAGS.version + ) + ) + return "" + else: + # RHEL has the same install instructions for both aarch64 and x86 + if target_platform() == "rhel": + if target_machine == "aarch64": + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\ + && dnf clean expire-cache \\ + && dnf install -y datacenter-gpu-manager-{} +""".format( + dcgm_version, dcgm_version + ) + else: + if target_machine == "aarch64": + return """ +ENV DCGM_VERSION {} +# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads +RUN curl -o /tmp/cuda-keyring.deb \\ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa/cuda-keyring_1.0-1_all.deb \\ + && apt install /tmp/cuda-keyring.deb \\ + && rm /tmp/cuda-keyring.deb \\ + && apt-get update \\ + && apt-get install -y datacenter-gpu-manager=1:{} +""".format( + dcgm_version, dcgm_version + ) + else: + return """ +ENV DCGM_VERSION {} +# Install DCGM. 
Steps from https://developer.nvidia.com/dcgm#Downloads +RUN curl -o /tmp/cuda-keyring.deb \\ + https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \\ + && apt install /tmp/cuda-keyring.deb \\ + && rm /tmp/cuda-keyring.deb \\ + && apt-get update \\ + && apt-get install -y datacenter-gpu-manager=1:{} +""".format( + dcgm_version, dcgm_version + ) + + +def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION +""" + df += """ +# Install docker docker buildx +RUN yum install -y ca-certificates curl gnupg yum-utils \\ + && yum-config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo \\ + && yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# && yum install -y docker.io docker-buildx-plugin + +# libcurl4-openSSL-dev is needed for GCS +# python3-dev is needed by Torchvision +# python3-pip and libarchive-dev is needed by python backend +# libxml2-dev is needed for Azure Storage +# scons is needed for armnn_tflite backend build dep +RUN yum install -y \\ + ca-certificates \\ + autoconf \\ + automake \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + python3.11-devel \\ + python3-pip \\ + python3-setuptools \\ + rapidjson-devel \\ + python3-scons \\ + pkg-config \\ + unzip \\ + wget \\ + zlib-devel \\ + libarchive-devel \\ + libxml2-devel \\ + numactl-devel \\ + wget + +RUN pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + docker \\ + virtualenv + +# Install boost version >= 1.78 for boost::span +# Current libboost-dev apt packages are < 1.78, so install from tar.gz +RUN wget -O /tmp/boost.tar.gz \\ + https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\ + && (cd /tmp && tar xzf boost.tar.gz) \\ + && mv /tmp/boost_1_80_0/boost /usr/include/boost + +# Server build requires recent version of CMake (FetchContent required) +# Might not need this if the installed version of cmake is high enough for our build. +# RUN apt update -q=2 \\ +# && apt install -y gpg wget \\ +# && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\ +# && . /etc/os-release \\ +# && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\ +# && apt-get update -q=2 \\ +# && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +""" + if FLAGS.enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine()) + df += """ +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + df += """ +WORKDIR /workspace +RUN rm -fr * +COPY . . 
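+# The full Triton source tree is copied into /workspace so that the generated
+# build scripts can be run inside this image.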
+ENTRYPOINT [] +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) + + +def create_dockerfile_buildbase(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION +""" + # Install the windows- or linux-specific buildbase dependencies + if target_platform() == "windows": + df += """ +SHELL ["cmd", "/S", "/C"] +""" + else: + df += """ +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive + +# Install docker docker buildx +RUN apt-get update \\ + && apt-get install -y ca-certificates curl gnupg \\ + && install -m 0755 -d /etc/apt/keyrings \\ + && curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \\ + && chmod a+r /etc/apt/keyrings/docker.gpg \\ + && echo \\ + "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \\ + "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \\ + tee /etc/apt/sources.list.d/docker.list > /dev/null \\ + && apt-get update \\ + && apt-get install -y docker.io docker-buildx-plugin + +# libcurl4-openSSL-dev is needed for GCS +# python3-dev is needed by Torchvision +# python3-pip and libarchive-dev is needed by python backend +# libxml2-dev is needed for Azure Storage +# scons is needed for armnn_tflite backend build dep +RUN apt-get update \\ + && apt-get install -y --no-install-recommends \\ + ca-certificates \\ + autoconf \\ + automake \\ + build-essential \\ + git \\ + gperf \\ + libre2-dev \\ + libssl-dev \\ + libtool \\ + libcurl4-openssl-dev \\ + libb64-dev \\ + libgoogle-perftools-dev \\ + patchelf \\ + python3-dev \\ + python3-pip \\ + python3-setuptools \\ + rapidjson-dev \\ + scons \\ + software-properties-common \\ + pkg-config \\ + unzip \\ + wget \\ + zlib1g-dev \\ + libarchive-dev \\ + libxml2-dev \\ + libnuma-dev \\ + wget \\ + && rm -rf /var/lib/apt/lists/* + +RUN pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + docker \\ + virtualenv + +# Install boost version >= 1.78 for boost::span +# Current libboost-dev apt packages are < 1.78, so install from tar.gz +RUN wget -O /tmp/boost.tar.gz \\ + https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\ + && (cd /tmp && tar xzf boost.tar.gz) \\ + && mv /tmp/boost_1_80_0/boost /usr/include/boost + +# Server build requires recent version of CMake (FetchContent required) +RUN apt update -q=2 \\ + && apt install -y gpg wget \\ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\ + && . /etc/os-release \\ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\ + && apt-get update -q=2 \\ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +""" + + if FLAGS.enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine()) + + df += """ +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + # Copy in the triton source. 
We remove existing contents first in + # case the FROM container has something there already. + if target_platform() == "windows": + df += """ +WORKDIR /workspace +RUN rmdir /S/Q * || exit 0 +COPY . . +""" + else: + df += """ +WORKDIR /workspace +RUN rm -fr * +COPY . . +ENTRYPOINT [] +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) + + +def create_dockerfile_cibase(ddir, dockerfile_name, argmap): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + df += """ +FROM ${BASE_IMAGE} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION + +COPY build/ci /workspace + +WORKDIR /workspace + +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +""" + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) + + +def create_dockerfile_linux( + ddir, dockerfile_name, argmap, backends, repoagents, caches, endpoints +): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} + +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + + # PyTorch and TensorFlow backends need extra CUDA and other + # dependencies during runtime that are missing in the CPU-only base container. + # These dependencies must be copied from the Triton Min image. + if not FLAGS.enable_gpu and (("pytorch" in backends) or ("tensorflow" in backends)): + df += """ +############################################################################ +## Triton Min image +############################################################################ +FROM {} AS min_container + +""".format( + argmap["GPU_BASE_IMAGE"] + ) + + df += """ +############################################################################ +## Production stage: Create container with just inference server executable +############################################################################ +FROM ${BASE_IMAGE} +""" + + df += dockerfile_prepare_container_linux( + argmap, backends, FLAGS.enable_gpu, target_machine() + ) + + df += """ +WORKDIR /opt +COPY --chown=1000:1000 build/install tritonserver + +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 NVIDIA_Deep_Learning_Container_License.pdf . + +""" + if not FLAGS.no_core_build: + # Add feature labels for SageMaker endpoint + if "sagemaker" in endpoints: + df += """ +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +LABEL com.amazonaws.sagemaker.capabilities.multi-models=true +COPY --chown=1000:1000 docker/sagemaker/serve /usr/bin/. 
+""" + + # This is required since libcublasLt.so is not present during the build + # stage of the PyTorch backend + if not FLAGS.enable_gpu and ("pytorch" in backends): + df += """ +RUN patchelf --add-needed /usr/local/cuda/lib64/stubs/libcublasLt.so.12 backends/pytorch/libtorch_cuda.so +""" + if "tensorrtllm" in backends: + df += """ +# Install required packages for TRT-LLM models +# Remove contents that are not needed in runtime +# Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1 +# The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0 +RUN ldconfig && \\ + ARCH="$(uname -i)" && \\ + rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \\ + rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \\ + rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \\ + python3 -m pip install --upgrade pip && \\ + pip3 install --no-cache-dir transformers && \\ + find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \\ + find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \\ + pip3 install --no-cache-dir grpcio-tools==1.64.0 && \\ + pip3 uninstall -y setuptools +ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH + +# There are some ucc issues when spawning mpi processes with ompi v4.1.7a1. +# Downgrade to ompi v4.1.5rc2 to avoid the issue. +RUN rm -fr /opt/hpcx/ompi +COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi +""" + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) + + +def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_machine): + gpu_enabled = 1 if enable_gpu else 0 + # Common steps to produce docker images shared by build.py and compose.py. + # Sets environment variables, installs dependencies and adds entrypoint + df = """ +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION + +ENV TRITON_SERVER_VERSION ${TRITON_VERSION} +ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION} +LABEL com.nvidia.tritonserver.version="${TRITON_SERVER_VERSION}" + +ENV PATH /opt/tritonserver/bin:${PATH} +# Remove once https://github.com/openucx/ucx/pull/9148 is available +# in the min container. +ENV UCX_MEM_EVENTS no +""" + + # Necessary for libtorch.so to find correct HPCX libraries + if "pytorch" in backends: + df += """ +ENV LD_LIBRARY_PATH /opt/hpcx/ucc/lib/:/opt/hpcx/ucx/lib/:${LD_LIBRARY_PATH} +""" + + backend_dependencies = "" + # libgomp1 is needed by both onnxruntime and pytorch backends + if ("onnxruntime" in backends) or ("pytorch" in backends): + backend_dependencies = "libgomp1" + + # libgfortran5 is needed by pytorch backend on ARM + if ("pytorch" in backends) and (target_machine == "aarch64"): + backend_dependencies += " libgfortran5" + # openssh-server is needed for fastertransformer + if "fastertransformer" in backends: + backend_dependencies += " openssh-server" + + df += """ +ENV TF_ADJUST_HUE_FUSED 1 +ENV TF_ADJUST_SATURATION_FUSED 1 +ENV TF_ENABLE_WINOGRAD_NONFUSED 1 +ENV TF_AUTOTUNE_THRESHOLD 2 +ENV TRITON_SERVER_GPU_ENABLED {gpu_enabled} + +# Create a user that can be used to run triton as +# non-root. Make sure that this user to given ID 1000. All server +# artifacts copied below are assign to this user. 
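+# (UID/GID 1000 matches the --chown=1000:1000 used when the server install
+# artifacts are copied in below.)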
+ENV TRITON_SERVER_USER=triton-server +RUN userdel tensorrt-server > /dev/null 2>&1 || true \\ + && if ! id -u $TRITON_SERVER_USER > /dev/null 2>&1 ; then \\ + useradd $TRITON_SERVER_USER; \\ + fi \\ + && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \\ + && [ `id -g $TRITON_SERVER_USER` -eq 1000 ] +""".format( + gpu_enabled=gpu_enabled + ) + + if target_platform() == "rhel": + df += """ +# Common dpeendencies. +RUN yum install -y \\ + git \\ + gperf \\ + re2-devel \\ + openssl-devel \\ + libtool \\ + libcurl-devel \\ + libb64-devel \\ + gperftools-devel \\ + patchelf \\ + wget \\ + numactl-devel +""" + else: + df += """ +# Ensure apt-get won't prompt for selecting options +ENV DEBIAN_FRONTEND=noninteractive + +# Common dependencies. FIXME (can any of these be conditional? For +# example libcurl only needed for GCS?) +RUN apt-get update \\ + && apt-get install -y --no-install-recommends \\ + clang \\ + curl \\ + dirmngr \\ + git \\ + gperf \\ + libb64-0d \\ + libcurl4-openssl-dev \\ + libgoogle-perftools-dev \\ + libjemalloc-dev \\ + libnuma-dev \\ + libre2-9 \\ + software-properties-common \\ + wget \\ + {backend_dependencies} \\ + && rm -rf /var/lib/apt/lists/* +""".format( + backend_dependencies=backend_dependencies + ) + + df += """ +# Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc +ENV TCMALLOC_RELEASE_RATE 200 +""" + + if "fastertransformer" in backends: + be = "fastertransformer" + url = "https://raw.githubusercontent.com/triton-inference-server/fastertransformer_backend/{}/docker/create_dockerfile_and_build.py".format( + backends[be] + ) + response = requests.get(url) + spec = importlib.util.spec_from_loader( + "fastertransformer_buildscript", loader=None, origin=url + ) + fastertransformer_buildscript = importlib.util.module_from_spec(spec) + exec(response.content, fastertransformer_buildscript.__dict__) + df += fastertransformer_buildscript.create_postbuild(is_multistage_build=False) + + if enable_gpu: + df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine) + # This segment will break the RHEL SBSA build. Need to determine whether + # this is necessary to incorporate. + if target_platform() != "rhel": + df += """ +# Extra defensive wiring for CUDA Compat lib +RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\ + && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\ + && ldconfig \\ + && rm -f ${_CUDA_COMPAT_PATH}/lib +""" + else: + df += add_cpu_libs_to_linux_dockerfile(backends, target_machine) + + # Add dependencies needed for python backend + if "python" in backends: + df += """ +# python3, python3-pip and some pip installs required for the python backend +RUN apt-get update \\ + && apt-get install -y --no-install-recommends \\ + python3 \\ + libarchive-dev \\ + python3-pip \\ + libpython3-dev \\ + && pip3 install --upgrade pip \\ + && pip3 install --upgrade \\ + wheel \\ + setuptools \\ + \"numpy<2\" \\ + virtualenv \\ + && rm -rf /var/lib/apt/lists/* +""" + if "tensorrtllm" in backends: + df += """ +# Updating the openssh-client to fix for the CVE-2024-6387. 
This can be removed when trtllm uses a later CUDA container(12.5 or later) +RUN apt-get update \\ + && apt-get install -y --no-install-recommends \\ + openssh-client \\ + && rm -rf /var/lib/apt/lists/* + """ + + if "vllm" in backends: + df += """ +# vLLM needed for vLLM backend +RUN pip3 install vllm=={} +""".format( + TRITON_VERSION_MAP[FLAGS.version][6] + ) + + if "dali" in backends: + df += """ +# Update Python path to include DALI +ENV PYTHONPATH=/opt/tritonserver/backends/dali/wheel/dali:$PYTHONPATH +""" + + df += """ +WORKDIR /opt/tritonserver +RUN rm -fr /opt/tritonserver/* +ENV NVIDIA_PRODUCT_NAME="Triton Server" +COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/ +""" + + # The CPU-only build uses ubuntu as the base image, and so the + # entrypoint files are not available in /opt/nvidia in the base + # image, so we must provide them ourselves. + if not enable_gpu: + df += """ +COPY docker/cpu_only/ /opt/nvidia/ +ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"] +""" + + df += """ +ENV NVIDIA_BUILD_ID {} +LABEL com.nvidia.build.id={} +LABEL com.nvidia.build.ref={} +""".format( + argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_REF"] + ) + + return df + + +def add_cpu_libs_to_linux_dockerfile(backends, target_machine): + df = "" + libs_arch = "aarch64" if target_machine == "aarch64" else "x86_64" + if "pytorch" in backends: + # Add extra dependencies for pytorch backend. + # Note: Even though the build is CPU-only, the version of pytorch + # we are using depend upon libraries like cuda and cudnn. Since + # these dependencies are not present in the ubuntu base image, + # we must copy these from the Triton min container ourselves. + cuda_arch = "sbsa" if target_machine == "aarch64" else "x86_64" + df += """ +RUN mkdir -p /usr/local/cuda/lib64/stubs +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusparse.so /usr/local/cuda/lib64/stubs/libcusparse.so.12 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusolver.so /usr/local/cuda/lib64/stubs/libcusolver.so.11 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcurand.so /usr/local/cuda/lib64/stubs/libcurand.so.10 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcufft.so /usr/local/cuda/lib64/stubs/libcufft.so.11 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublas.so /usr/local/cuda/lib64/stubs/libcublas.so.12 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.12 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.11 + +RUN mkdir -p /usr/local/cuda/targets/{cuda_arch}-linux/lib +COPY --from=min_container /usr/local/cuda/lib64/libcudart.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libcupti.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libnvToolsExt.so.1 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libnvJitLink.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. 
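+# The CUDA stub and runtime libraries above come from the Triton min container
+# because the CPU-only base image does not provide them.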
+ +RUN mkdir -p /opt/hpcx/ucc/lib/ /opt/hpcx/ucx/lib/ +COPY --from=min_container /opt/hpcx/ucc/lib/libucc.so.1 /opt/hpcx/ucc/lib/libucc.so.1 +COPY --from=min_container /opt/hpcx/ucx/lib/libucm.so.0 /opt/hpcx/ucx/lib/libucm.so.0 +COPY --from=min_container /opt/hpcx/ucx/lib/libucp.so.0 /opt/hpcx/ucx/lib/libucp.so.0 +COPY --from=min_container /opt/hpcx/ucx/lib/libucs.so.0 /opt/hpcx/ucx/lib/libucs.so.0 +COPY --from=min_container /opt/hpcx/ucx/lib/libuct.so.0 /opt/hpcx/ucx/lib/libuct.so.0 + +COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 + +# patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so +RUN apt-get update \\ + && apt-get install -y --no-install-recommends openmpi-bin patchelf + +ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}} +""".format( + cuda_arch=cuda_arch, libs_arch=libs_arch + ) + + if ("pytorch" in backends) or ("tensorflow" in backends): + # Add NCCL dependency for tensorflow/pytorch backend. + # Note: Even though the build is CPU-only, the version of + # tensorflow/pytorch we are using depends upon the NCCL library. + # Since this dependency is not present in the ubuntu base image, + # we must copy it from the Triton min container ourselves. + df += """ +COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libnccl.so.2 /usr/lib/{libs_arch}-linux-gnu/libnccl.so.2 +""".format( + libs_arch=libs_arch + ) + + return df + + +def create_dockerfile_windows( + ddir, dockerfile_name, argmap, backends, repoagents, caches +): + df = """ +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} +ARG BASE_IMAGE={} + +############################################################################ +## Production stage: Create container with just inference server executable +############################################################################ +FROM ${{BASE_IMAGE}} + +ARG TRITON_VERSION +ARG TRITON_CONTAINER_VERSION + +ENV TRITON_SERVER_VERSION ${{TRITON_VERSION}} +ENV NVIDIA_TRITON_SERVER_VERSION ${{TRITON_CONTAINER_VERSION}} +LABEL com.nvidia.tritonserver.version="${{TRITON_SERVER_VERSION}}" + +RUN setx path "%path%;C:\opt\tritonserver\bin" + +""".format( + argmap["TRITON_VERSION"], + argmap["TRITON_CONTAINER_VERSION"], + argmap["BASE_IMAGE"], + ) + df += """ +WORKDIR /opt +RUN rmdir /S/Q tritonserver || exit 0 +COPY --chown=1000:1000 build/install tritonserver + +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 NVIDIA_Deep_Learning_Container_License.pdf . 
+ +""" + df += """ +ENTRYPOINT [] +ENV NVIDIA_BUILD_ID {} +LABEL com.nvidia.build.id={} +LABEL com.nvidia.build.ref={} +""".format( + argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_REF"] + ) + + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) + + +def create_build_dockerfiles( + container_build_dir, images, backends, repoagents, caches, endpoints +): + if "base" in images: + base_image = images["base"] + elif target_platform() == "windows": + base_image = "mcr.microsoft.com/dotnet/framework/sdk:4.8" + elif FLAGS.enable_gpu: + base_image = "nvcr.io/nvidia/tritonserver:{}-py3-min".format( + FLAGS.upstream_container_version + ) + else: + base_image = "ubuntu:22.04" + + dockerfileargmap = { + "NVIDIA_BUILD_REF": "" if FLAGS.build_sha is None else FLAGS.build_sha, + "NVIDIA_BUILD_ID": "<unknown>" if FLAGS.build_id is None else FLAGS.build_id, + "TRITON_VERSION": FLAGS.version, + "TRITON_CONTAINER_VERSION": FLAGS.container_version, + "BASE_IMAGE": base_image, + "DCGM_VERSION": "" + if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP + else TRITON_VERSION_MAP[FLAGS.version][5], + } + + # For CPU-only image we need to copy some cuda libraries and dependencies + # since we are using PyTorch and TensorFlow containers that + # are not CPU-only. + if ( + not FLAGS.enable_gpu + and (("pytorch" in backends) or ("tensorflow" in backends)) + and (target_platform() != "windows") + ): + if "gpu-base" in images: + gpu_base_image = images["gpu-base"] + else: + gpu_base_image = "nvcr.io/nvidia/tritonserver:{}-py3-min".format( + FLAGS.upstream_container_version + ) + dockerfileargmap["GPU_BASE_IMAGE"] = gpu_base_image + + if target_platform() == "rhel": + create_dockerfile_buildbase_rhel( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) + else: + create_dockerfile_buildbase( + FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap + ) + + if target_platform() == "windows": + create_dockerfile_windows( + FLAGS.build_dir, + "Dockerfile", + dockerfileargmap, + backends, + repoagents, + caches, + ) + else: + create_dockerfile_linux( + FLAGS.build_dir, + "Dockerfile", + dockerfileargmap, + backends, + repoagents, + caches, + endpoints, + ) + + # Dockerfile used for the creating the CI base image. + create_dockerfile_cibase(FLAGS.build_dir, "Dockerfile.cibase", dockerfileargmap) + + +def create_docker_build_script(script_name, container_install_dir, container_ci_dir): + with BuildScript( + os.path.join(FLAGS.build_dir, script_name), + verbose=FLAGS.verbose, + desc=("Docker-based build script for Triton Inference Server"), + ) as docker_script: + # + # Build base image... tritonserver_buildbase + # + docker_script.commentln(8) + docker_script.comment("Create Triton base build image") + docker_script.comment( + "This image contains all dependencies necessary to build Triton" + ) + docker_script.comment() + + cachefrommap = [ + "tritonserver_buildbase", + "tritonserver_buildbase_cache0", + "tritonserver_buildbase_cache1", + ] + + baseargs = [ + "docker", + "build", + "-t", + "tritonserver_buildbase", + "-f", + os.path.join(FLAGS.build_dir, "Dockerfile.buildbase"), + ] + + if not FLAGS.no_container_pull: + baseargs += [ + "--pull", + ] + + # Windows docker runs in a VM and memory needs to be specified + # explicitly (at least for some configurations of docker). 
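+        # For example, "--container-memory 8g" results in "--memory 8g" being
+        # passed to the "docker build" invocation below.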
+ if target_platform() == "windows": + if FLAGS.container_memory: + baseargs += ["--memory", FLAGS.container_memory] + + baseargs += ["--cache-from={}".format(k) for k in cachefrommap] + baseargs += ["."] + + docker_script.cwd(THIS_SCRIPT_DIR) + docker_script.cmd(baseargs, check_exitcode=True) + + # + # Build... + # + docker_script.blankln() + docker_script.commentln(8) + docker_script.comment("Run build in tritonserver_buildbase container") + docker_script.comment("Mount a directory into the container where the install") + docker_script.comment("artifacts will be placed.") + docker_script.comment() + + # Don't use '-v' to communicate the built artifacts out of the + # build, because we want this code to work even if run within + # Docker (i.e. docker-in-docker) and not just if run directly + # from host. + runargs = [ + "docker", + "run", + "-w", + "/workspace/build", + "--name", + "tritonserver_builder", + ] + + if not FLAGS.no_container_interactive: + runargs += ["-it"] + + if target_platform() == "windows": + if FLAGS.container_memory: + runargs += ["--memory", FLAGS.container_memory] + runargs += ["-v", "\\\\.\pipe\docker_engine:\\\\.\pipe\docker_engine"] + else: + runargs += ["-v", "/var/run/docker.sock:/var/run/docker.sock"] + + runargs += ["tritonserver_buildbase"] + + if target_platform() == "windows": + runargs += ["powershell.exe", "-noexit", "-File", "./cmake_build.ps1"] + else: + runargs += ["./cmake_build"] + + # Remove existing tritonserver_builder container... + if target_platform() == "windows": + docker_script.cmd(["docker", "rm", "tritonserver_builder"]) + else: + docker_script._file.write( + 'if [ "$(docker ps -a | grep tritonserver_builder)" ]; then docker rm -f tritonserver_builder; fi\n' + ) + + docker_script.cmd(runargs, check_exitcode=True) + + docker_script.cmd( + [ + "docker", + "cp", + "tritonserver_builder:/tmp/tritonbuild/install", + FLAGS.build_dir, + ], + check_exitcode=True, + ) + docker_script.cmd( + [ + "docker", + "cp", + "tritonserver_builder:/tmp/tritonbuild/ci", + FLAGS.build_dir, + ], + check_exitcode=True, + ) + + # + # Final image... tritonserver + # + docker_script.blankln() + docker_script.commentln(8) + docker_script.comment("Create final tritonserver image") + docker_script.comment() + + finalargs = [ + "docker", + "build", + "-t", + "tritonserver", + "-f", + os.path.join(FLAGS.build_dir, "Dockerfile"), + ".", + ] + + docker_script.cwd(THIS_SCRIPT_DIR) + docker_script.cmd(finalargs, check_exitcode=True) + + # + # CI base image... 
tritonserver_cibase + # + docker_script.blankln() + docker_script.commentln(8) + docker_script.comment("Create CI base image") + docker_script.comment() + + cibaseargs = [ + "docker", + "build", + "-t", + "tritonserver_cibase", + "-f", + os.path.join(FLAGS.build_dir, "Dockerfile.cibase"), + ".", + ] + + docker_script.cwd(THIS_SCRIPT_DIR) + docker_script.cmd(cibaseargs, check_exitcode=True) + + +def core_build( + cmake_script, repo_dir, cmake_dir, build_dir, install_dir, components, backends +): + repo_build_dir = os.path.join(build_dir, "tritonserver", "build") + repo_install_dir = os.path.join(build_dir, "tritonserver", "install") + + cmake_script.commentln(8) + cmake_script.comment("Triton core library and tritonserver executable") + cmake_script.comment() + cmake_script.mkdir(repo_build_dir) + cmake_script.cwd(repo_build_dir) + cmake_script.cmake( + core_cmake_args(components, backends, cmake_dir, repo_install_dir) + ) + cmake_script.makeinstall() + + if target_platform() == "windows": + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver.exe"), + os.path.join(install_dir, "bin"), + ) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver.dll"), + os.path.join(install_dir, "bin"), + ) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "tritonserver.lib"), + os.path.join(install_dir, "bin"), + ) + elif target_platform() == "rhel": + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver"), + os.path.join(install_dir, "bin"), + ) + cmake_script.mkdir(os.path.join(install_dir, "lib64")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib64", "libtritonserver.so"), + os.path.join(install_dir, "lib64"), + ) + else: + cmake_script.mkdir(os.path.join(install_dir, "bin")) + cmake_script.cp( + os.path.join(repo_install_dir, "bin", "tritonserver"), + os.path.join(install_dir, "bin"), + ) + cmake_script.mkdir(os.path.join(install_dir, "lib")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "libtritonserver.so"), + os.path.join(install_dir, "lib"), + ) + # [FIXME] Placing the Triton server wheel file in 'python' for now, should + # have been upload to pip registry and be able to install directly + cmake_script.mkdir(os.path.join(install_dir, "python")) + cmake_script.cp( + os.path.join(repo_install_dir, "python", "tritonserver*.whl"), + os.path.join(install_dir, "python"), + ) + + cmake_script.mkdir(os.path.join(install_dir, "include", "triton")) + cmake_script.cpdir( + os.path.join(repo_install_dir, "include", "triton", "core"), + os.path.join(install_dir, "include", "triton", "core"), + ) + + cmake_script.cp(os.path.join(repo_dir, "LICENSE"), install_dir) + cmake_script.cp(os.path.join(repo_dir, "TRITON_VERSION"), install_dir) + + # If requested, package the source code for all OSS used to build + # For windows, Triton is not delivered as a container so skip for + # windows platform. 
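+    # The resulting tarball is written to
+    # <install_dir>/third-party-src/src.tar.gz along with
+    # docker/README.third-party-src (renamed to README).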
+ if target_platform() != "windows": + if ( + (not FLAGS.no_container_build) + and (not FLAGS.no_core_build) + and (not FLAGS.no_container_source) + ): + cmake_script.mkdir(os.path.join(install_dir, "third-party-src")) + cmake_script.cwd(repo_build_dir) + cmake_script.tar( + "third-party-src", + os.path.join(install_dir, "third-party-src", "src.tar.gz"), + ) + cmake_script.cp( + os.path.join(repo_dir, "docker", "README.third-party-src"), + os.path.join(install_dir, "third-party-src", "README"), + ) + + cmake_script.comment() + cmake_script.comment("end Triton core library and tritonserver executable") + cmake_script.commentln(8) + cmake_script.blankln() + + +def tensorrtllm_prebuild(cmake_script): + # Export the TRT_ROOT environment variable + cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt") + cmake_script.cmd("export ARCH=$(uname -m)") + cmake_script.cmd( + 'export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib.real:${LD_LIBRARY_PATH}"' + ) + + +def tensorrtllm_postbuild(cmake_script, repo_install_dir, tensorrtllm_be_dir): + # TODO: Update the CMakeLists.txt of TRT-LLM backend to install the artifacts to the correct location + cmake_destination_dir = os.path.join(repo_install_dir, "backends/tensorrtllm") + cmake_script.mkdir(cmake_destination_dir) + + # Copy over the TRT-LLM backend libraries + cmake_script.cp( + os.path.join(tensorrtllm_be_dir, "build", "libtriton_tensorrtllm*.so"), + cmake_destination_dir, + ) + cmake_script.cp( + os.path.join(tensorrtllm_be_dir, "build", "trtllmExecutorWorker"), + cmake_destination_dir, + ) + + +def backend_build( + be, + cmake_script, + tag, + build_dir, + install_dir, + github_organization, + images, + components, + library_paths, +): + repo_build_dir = os.path.join(build_dir, be, "build") + repo_install_dir = os.path.join(build_dir, be, "install") + + cmake_script.commentln(8) + cmake_script.comment(f"'{be}' backend") + cmake_script.comment("Delete this section to remove backend from build") + cmake_script.comment() + cmake_script.mkdir(build_dir) + cmake_script.cwd(build_dir) + cmake_script.gitclone(backend_repo(be), tag, be, github_organization) + + if be == "tensorrtllm": + tensorrtllm_prebuild(cmake_script) + + cmake_script.mkdir(repo_build_dir) + cmake_script.cwd(repo_build_dir) + cmake_script.cmake( + backend_cmake_args(images, components, be, repo_install_dir, library_paths) + ) + cmake_script.makeinstall() + + if be == "tensorrtllm": + tensorrtllm_be_dir = os.path.join(build_dir, be) + tensorrtllm_postbuild(cmake_script, repo_install_dir, tensorrtllm_be_dir) + + cmake_script.mkdir(os.path.join(install_dir, "backends")) + cmake_script.rmdir(os.path.join(install_dir, "backends", be)) + + cmake_script.cpdir( + os.path.join(repo_install_dir, "backends", be), + os.path.join(install_dir, "backends"), + ) + + cmake_script.comment() + cmake_script.comment(f"end '{be}' backend") + cmake_script.commentln(8) + cmake_script.blankln() + + +def backend_clone( + be, + clone_script, + tag, + build_dir, + install_dir, + github_organization, +): + clone_script.commentln(8) + clone_script.comment(f"'{be}' backend") + clone_script.comment("Delete this section to remove backend from build") + clone_script.comment() + clone_script.mkdir(build_dir) + clone_script.cwd(build_dir) + clone_script.gitclone(backend_repo(be), tag, be, github_organization) + + repo_target_dir = os.path.join(install_dir, "backends") + clone_script.mkdir(repo_target_dir) + backend_dir = os.path.join(repo_target_dir, be) + clone_script.rmdir(backend_dir) + clone_script.mkdir(backend_dir) 
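+    # Backends handled by backend_clone are not built with cmake; only their
+    # src/model.py and src/utils directory are copied into the backend
+    # install location.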
+ + clone_script.cp( + os.path.join(build_dir, be, "src", "model.py"), + backend_dir, + ) + clone_script.cpdir( + os.path.join(build_dir, be, "src", "utils"), + backend_dir, + ) + + clone_script.comment() + clone_script.comment(f"end '{be}' backend") + clone_script.commentln(8) + clone_script.blankln() + + +def repo_agent_build( + ra, cmake_script, build_dir, install_dir, repoagent_repo, repoagents +): + repo_build_dir = os.path.join(build_dir, ra, "build") + repo_install_dir = os.path.join(build_dir, ra, "install") + + cmake_script.commentln(8) + cmake_script.comment(f"'{ra}' repository agent") + cmake_script.comment("Delete this section to remove repository agent from build") + cmake_script.comment() + cmake_script.mkdir(build_dir) + cmake_script.cwd(build_dir) + cmake_script.gitclone( + repoagent_repo(ra), repoagents[ra], ra, FLAGS.github_organization + ) + + cmake_script.mkdir(repo_build_dir) + cmake_script.cwd(repo_build_dir) + cmake_script.cmake(repoagent_cmake_args(images, components, ra, repo_install_dir)) + cmake_script.makeinstall() + + cmake_script.mkdir(os.path.join(install_dir, "repoagents")) + cmake_script.rmdir(os.path.join(install_dir, "repoagents", ra)) + cmake_script.cpdir( + os.path.join(repo_install_dir, "repoagents", ra), + os.path.join(install_dir, "repoagents"), + ) + cmake_script.comment() + cmake_script.comment(f"end '{ra}' repository agent") + cmake_script.commentln(8) + cmake_script.blankln() + + +def cache_build(cache, cmake_script, build_dir, install_dir, cache_repo, caches): + repo_build_dir = os.path.join(build_dir, cache, "build") + repo_install_dir = os.path.join(build_dir, cache, "install") + + cmake_script.commentln(8) + cmake_script.comment(f"'{cache}' cache") + cmake_script.comment("Delete this section to remove cache from build") + cmake_script.comment() + cmake_script.mkdir(build_dir) + cmake_script.cwd(build_dir) + cmake_script.gitclone( + cache_repo(cache), caches[cache], cache, FLAGS.github_organization + ) + + cmake_script.mkdir(repo_build_dir) + cmake_script.cwd(repo_build_dir) + cmake_script.cmake(cache_cmake_args(images, components, cache, repo_install_dir)) + cmake_script.makeinstall() + + cmake_script.mkdir(os.path.join(install_dir, "caches")) + cmake_script.rmdir(os.path.join(install_dir, "caches", cache)) + cmake_script.cpdir( + os.path.join(repo_install_dir, "caches", cache), + os.path.join(install_dir, "caches"), + ) + cmake_script.comment() + cmake_script.comment(f"end '{cache}' cache") + cmake_script.commentln(8) + cmake_script.blankln() + + +def cibase_build( + cmake_script, repo_dir, cmake_dir, build_dir, install_dir, ci_dir, backends +): + repo_install_dir = os.path.join(build_dir, "tritonserver", "install") + + cmake_script.commentln(8) + cmake_script.comment("Collect Triton CI artifacts") + cmake_script.comment() + + cmake_script.mkdir(ci_dir) + + # On windows we are not yet using a CI/QA docker image for + # testing, so don't do anything... + if target_platform() == "windows": + return + + # The core build produces some artifacts that are needed for CI + # testing, so include those in the install. 
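+    # Collected below: qa/, deploy/, docs/examples, src/test/models and, when
+    # the core build is enabled, the installed bin/ and python/ directories
+    # plus libtritonrepoagent_relocation.so.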
+ cmake_script.cpdir(os.path.join(repo_dir, "qa"), ci_dir) + cmake_script.cpdir(os.path.join(repo_dir, "deploy"), ci_dir) + cmake_script.mkdir(os.path.join(ci_dir, "docs")) + cmake_script.cpdir( + os.path.join(repo_dir, "docs", "examples"), os.path.join(ci_dir, "docs") + ) + cmake_script.mkdir(os.path.join(ci_dir, "src", "test")) + cmake_script.cpdir( + os.path.join(repo_dir, "src", "test", "models"), + os.path.join(ci_dir, "src", "test"), + ) + # Skip copying the artifacts in the bin, lib, and python as those directories will + # be missing when the core build is not enabled. + if not FLAGS.no_core_build: + cmake_script.cpdir(os.path.join(repo_install_dir, "bin"), ci_dir) + cmake_script.mkdir(os.path.join(ci_dir, "lib")) + cmake_script.cp( + os.path.join(repo_install_dir, "lib", "libtritonrepoagent_relocation.so"), + os.path.join(ci_dir, "lib"), + ) + cmake_script.cpdir(os.path.join(repo_install_dir, "python"), ci_dir) + + # Some of the backends are needed for CI testing + cmake_script.mkdir(os.path.join(ci_dir, "backends")) + for be in ("identity", "repeat", "square"): + be_install_dir = os.path.join(build_dir, be, "install", "backends", be) + if target_platform() == "windows": + cmake_script.cmd(f"if (Test-Path -Path {be_install_dir}) {{") + else: + cmake_script.cmd(f"if [[ -e {be_install_dir} ]]; then") + cmake_script.cpdir(be_install_dir, os.path.join(ci_dir, "backends")) + cmake_script.cmd("}" if target_platform() == "windows" else "fi") + + # Some of the unit-test built backends are needed for CI testing + cmake_script.mkdir(os.path.join(ci_dir, "tritonbuild", "tritonserver", "backends")) + for be in ( + "query", + "implicit_state", + "sequence", + "dyna_sequence", + "distributed_addsub", + "iterative_sequence", + ): + be_install_dir = os.path.join(repo_install_dir, "backends", be) + if target_platform() == "windows": + cmake_script.cmd(f"if (Test-Path -Path {be_install_dir}) {{") + else: + cmake_script.cmd(f"if [[ -e {be_install_dir} ]]; then") + cmake_script.cpdir( + be_install_dir, + os.path.join(ci_dir, "tritonbuild", "tritonserver", "backends"), + ) + cmake_script.cmd("}" if target_platform() == "windows" else "fi") + + # The onnxruntime_backend build produces some artifacts that + # are needed for CI testing. + if "onnxruntime" in backends: + ort_install_dir = os.path.join(build_dir, "onnxruntime", "install") + cmake_script.mkdir(os.path.join(ci_dir, "qa", "L0_custom_ops")) + if target_platform() != "igpu": + cmake_script.cp( + os.path.join(ort_install_dir, "test", "libcustom_op_library.so"), + os.path.join(ci_dir, "qa", "L0_custom_ops"), + ) + cmake_script.cp( + os.path.join(ort_install_dir, "test", "custom_op_test.onnx"), + os.path.join(ci_dir, "qa", "L0_custom_ops"), + ) + # [WIP] other way than wildcard? + backend_tests = os.path.join(build_dir, "onnxruntime", "test", "*") + cmake_script.cpdir(backend_tests, os.path.join(ci_dir, "qa")) + + # Need the build area for some backends so that they can be + # rebuilt with specific options. 
+ cmake_script.mkdir(os.path.join(ci_dir, "tritonbuild")) + for be in ("identity", "python"): + if be in backends: + cmake_script.rmdir(os.path.join(build_dir, be, "build")) + cmake_script.rmdir(os.path.join(build_dir, be, "install")) + cmake_script.cpdir( + os.path.join(build_dir, be), os.path.join(ci_dir, "tritonbuild") + ) + + cmake_script.comment() + cmake_script.comment("end Triton CI artifacts") + cmake_script.commentln(8) + cmake_script.blankln() + + +def finalize_build(cmake_script, install_dir, ci_dir): + cmake_script.cmd(f"chmod -R a+rw {install_dir}") + cmake_script.cmd(f"chmod -R a+rw {ci_dir}") + + +def enable_all(): + if target_platform() != "windows": + all_backends = [ + "ensemble", + "identity", + "square", + "repeat", + "tensorflow", + "onnxruntime", + "python", + "dali", + "pytorch", + "openvino", + "fil", + "tensorrt", + ] + all_repoagents = ["checksum"] + all_caches = ["local", "redis"] + all_filesystems = ["gcs", "s3", "azure_storage"] + all_endpoints = ["http", "grpc", "sagemaker", "vertex-ai"] + + FLAGS.enable_logging = True + FLAGS.enable_stats = True + FLAGS.enable_metrics = True + FLAGS.enable_gpu_metrics = True + FLAGS.enable_cpu_metrics = True + FLAGS.enable_tracing = True + FLAGS.enable_nvtx = True + FLAGS.enable_gpu = True + else: + all_backends = [ + "ensemble", + "identity", + "square", + "repeat", + "onnxruntime", + "openvino", + "tensorrt", + ] + all_repoagents = ["checksum"] + all_caches = ["local", "redis"] + all_filesystems = [] + all_endpoints = ["http", "grpc"] + + FLAGS.enable_logging = True + FLAGS.enable_stats = True + FLAGS.enable_tracing = True + FLAGS.enable_gpu = True + + requested_backends = [] + for be in FLAGS.backend: + parts = be.split(":") + requested_backends += [parts[0]] + for be in all_backends: + if be not in requested_backends: + FLAGS.backend += [be] + + requested_repoagents = [] + for ra in FLAGS.repoagent: + parts = ra.split(":") + requested_repoagents += [parts[0]] + for ra in all_repoagents: + if ra not in requested_repoagents: + FLAGS.repoagent += [ra] + + requested_caches = [] + for cache in FLAGS.cache: + parts = cache.split(":") + requested_caches += [parts[0]] + for cache in all_caches: + if cache not in requested_caches: + FLAGS.cache += [cache] + + for fs in all_filesystems: + if fs not in FLAGS.filesystem: + FLAGS.filesystem += [fs] + + for ep in all_endpoints: + if ep not in FLAGS.endpoint: + FLAGS.endpoint += [ep] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + group_qv = parser.add_mutually_exclusive_group() + group_qv.add_argument( + "-q", + "--quiet", + action="store_true", + required=False, + help="Disable console output.", + ) + group_qv.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + help="Enable verbose output.", + ) + + parser.add_argument( + "--dryrun", + action="store_true", + required=False, + help="Output the build scripts, but do not perform build.", + ) + parser.add_argument( + "--no-container-build", + action="store_true", + required=False, + help="Do not use Docker container for build.", + ) + parser.add_argument( + "--no-container-interactive", + action="store_true", + required=False, + help='Do not use -it argument to "docker run" when performing container build.', + ) + parser.add_argument( + "--no-container-pull", + action="store_true", + required=False, + help="Do not use Docker --pull argument when building container.", + ) + parser.add_argument( + "--container-memory", + default=None, + required=False, + help="Value for Docker --memory 
argument. Used only for windows builds.", + ) + parser.add_argument( + "--target-platform", + required=False, + default=None, + help='Target platform for build, can be "linux", "rhel", "windows" or "igpu". If not specified, build targets the current platform.', + ) + parser.add_argument( + "--target-machine", + required=False, + default=None, + help="Target machine/architecture for build. If not specified, build targets the current machine/architecture.", + ) + + parser.add_argument( + "--build-id", + type=str, + required=False, + help="Build ID associated with the build.", + ) + parser.add_argument( + "--build-sha", type=str, required=False, help="SHA associated with the build." + ) + parser.add_argument( + "--build-dir", + type=str, + required=False, + help="Build directory. All repo clones and builds will be performed in this directory.", + ) + parser.add_argument( + "--install-dir", + type=str, + required=False, + default=None, + help="Install directory, default is <builddir>/opt/tritonserver.", + ) + parser.add_argument( + "--cmake-dir", + type=str, + required=False, + help="Directory containing the CMakeLists.txt file for Triton server.", + ) + parser.add_argument( + "--tmp-dir", + type=str, + required=False, + default="/tmp", + help="Temporary directory used for building inside docker. Default is /tmp.", + ) + parser.add_argument( + "--library-paths", + action="append", + required=False, + default=None, + help="Specify library paths for respective backends in build as <backend-name>[:<library_path>].", + ) + parser.add_argument( + "--build-type", + required=False, + default="Release", + help='Build type, one of "Release", "Debug", "RelWithDebInfo" or "MinSizeRel". Default is "Release".', + ) + parser.add_argument( + "-j", + "--build-parallel", + type=int, + required=False, + default=None, + help="Build parallelism. Defaults to 2 * number-of-cores.", + ) + + parser.add_argument( + "--github-organization", + type=str, + required=False, + default="https://github.com/triton-inference-server", + help='The GitHub organization containing the repos used for the build. Defaults to "https://github.com/triton-inference-server".', + ) + parser.add_argument( + "--version", + type=str, + required=False, + help="The Triton version. If not specified defaults to the value in the TRITON_VERSION file.", + ) + parser.add_argument( + "--container-version", + type=str, + required=False, + help="The Triton container version to build. If not specified the container version will be chosen automatically based on --version value.", + ) + parser.add_argument( + "--upstream-container-version", + type=str, + required=False, + help="The upstream container version to use for the build. If not specified the upstream container version will be chosen automatically based on --version value.", + ) + parser.add_argument( + "--container-prebuild-command", + type=str, + required=False, + help="When performing a container build, this command will be executed within the container just before the build it performed.", + ) + parser.add_argument( + "--no-container-source", + action="store_true", + required=False, + help="Do not include OSS source code in Docker container.", + ) + parser.add_argument( + "--image", + action="append", + required=False, + help='Use specified Docker image in build as <image-name>,<full-image-name>. 
<image-name> can be "base", "gpu-base", "tensorflow", or "pytorch".', + ) + + parser.add_argument( + "--enable-all", + action="store_true", + required=False, + help="Enable all standard released Triton features, backends, repository agents, caches, endpoints and file systems.", + ) + parser.add_argument( + "--enable-logging", action="store_true", required=False, help="Enable logging." + ) + parser.add_argument( + "--enable-stats", + action="store_true", + required=False, + help="Enable statistics collection.", + ) + parser.add_argument( + "--enable-metrics", + action="store_true", + required=False, + help="Enable metrics reporting.", + ) + parser.add_argument( + "--enable-gpu-metrics", + action="store_true", + required=False, + help="Include GPU metrics in reported metrics.", + ) + parser.add_argument( + "--enable-cpu-metrics", + action="store_true", + required=False, + help="Include CPU metrics in reported metrics.", + ) + parser.add_argument( + "--enable-tracing", action="store_true", required=False, help="Enable tracing." + ) + parser.add_argument( + "--enable-nvtx", action="store_true", required=False, help="Enable NVTX." + ) + parser.add_argument( + "--enable-gpu", action="store_true", required=False, help="Enable GPU support." + ) + parser.add_argument( + "--enable-mali-gpu", + action="store_true", + required=False, + help="Enable ARM MALI GPU support.", + ) + parser.add_argument( + "--min-compute-capability", + type=str, + required=False, + default="6.0", + help="Minimum CUDA compute capability supported by server.", + ) + + parser.add_argument( + "--endpoint", + action="append", + required=False, + help='Include specified endpoint in build. Allowed values are "grpc", "http", "vertex-ai" and "sagemaker".', + ) + parser.add_argument( + "--filesystem", + action="append", + required=False, + help='Include specified filesystem in build. Allowed values are "gcs", "azure_storage" and "s3".', + ) + parser.add_argument( + "--no-core-build", + action="store_true", + required=False, + help="Do not build Triton core shared library or executable.", + ) + parser.add_argument( + "--backend", + action="append", + required=False, + help='Include specified backend in build as <backend-name>[:<repo-tag>]. If <repo-tag> starts with "pull/" then it refers to a pull-request reference, otherwise <repo-tag> indicates the git tag/branch to use for the build. If the version is non-development then the default <repo-tag> is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default <repo-tag> is "main" (e.g. version YY.MMdev -> branch main).', + ) + parser.add_argument( + "--repo-tag", + action="append", + required=False, + help='The version of a component to use in the build as <component-name>:<repo-tag>. <component-name> can be "common", "core", "backend" or "thirdparty". <repo-tag> indicates the git tag/branch to use for the build. Currently <repo-tag> does not support pull-request reference. If the version is non-development then the default <repo-tag> is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default <repo-tag> is "main" (e.g. version YY.MMdev -> branch main).', + ) + parser.add_argument( + "--repoagent", + action="append", + required=False, + help='Include specified repo agent in build as <repoagent-name>[:<repo-tag>]. If <repo-tag> starts with "pull/" then it refers to a pull-request reference, otherwise <repo-tag> indicates the git tag/branch to use for the build. 
If the version is non-development then the default <repo-tag> is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default <repo-tag> is "main" (e.g. version YY.MMdev -> branch main).', + ) + parser.add_argument( + "--cache", + action="append", + required=False, + help='Include specified cache in build as <cache-name>[:<repo-tag>]. If <repo-tag> starts with "pull/" then it refers to a pull-request reference, otherwise <repo-tag> indicates the git tag/branch to use for the build. If the version is non-development then the default <repo-tag> is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default <repo-tag> is "main" (e.g. version YY.MMdev -> branch main).', + ) + parser.add_argument( + "--no-force-clone", + action="store_true", + default=False, + help="Do not create fresh clones of repos that have already been cloned.", + ) + parser.add_argument( + "--extra-core-cmake-arg", + action="append", + required=False, + help="Extra CMake argument as <name>=<value>. The argument is passed to CMake as -D<name>=<value> and is included after all CMake arguments added by build.py for the core builds.", + ) + parser.add_argument( + "--override-core-cmake-arg", + action="append", + required=False, + help="Override specified CMake argument in the build as <name>=<value>. The argument is passed to CMake as -D<name>=<value>. This flag only impacts CMake arguments that are used by build.py. To unconditionally add a CMake argument to the core build use --extra-core-cmake-arg.", + ) + parser.add_argument( + "--extra-backend-cmake-arg", + action="append", + required=False, + help="Extra CMake argument for a backend build as <backend>:<name>=<value>. The argument is passed to CMake as -D<name>=<value> and is included after all CMake arguments added by build.py for the backend.", + ) + parser.add_argument( + "--override-backend-cmake-arg", + action="append", + required=False, + help="Override specified backend CMake argument in the build as <backend>:<name>=<value>. The argument is passed to CMake as -D<name>=<value>. This flag only impacts CMake arguments that are used by build.py. To unconditionally add a CMake argument to the backend build use --extra-backend-cmake-arg.", + ) + + FLAGS = parser.parse_args() + + if FLAGS.image is None: + FLAGS.image = [] + if FLAGS.repo_tag is None: + FLAGS.repo_tag = [] + if FLAGS.backend is None: + FLAGS.backend = [] + if FLAGS.endpoint is None: + FLAGS.endpoint = [] + if FLAGS.filesystem is None: + FLAGS.filesystem = [] + if FLAGS.repoagent is None: + FLAGS.repoagent = [] + if FLAGS.cache is None: + FLAGS.cache = [] + if FLAGS.library_paths is None: + FLAGS.library_paths = [] + if FLAGS.extra_core_cmake_arg is None: + FLAGS.extra_core_cmake_arg = [] + if FLAGS.override_core_cmake_arg is None: + FLAGS.override_core_cmake_arg = [] + if FLAGS.override_backend_cmake_arg is None: + FLAGS.override_backend_cmake_arg = [] + if FLAGS.extra_backend_cmake_arg is None: + FLAGS.extra_backend_cmake_arg = [] + + # if --enable-all is specified, then update FLAGS to enable all + # settings, backends, repo-agents, caches, file systems, endpoints, etc. + if FLAGS.enable_all: + enable_all() + + # When doing a docker build, --build-dir, --install-dir and + # --cmake-dir must not be set. We will use the build/ subdir + # within the server/ repo that contains this build.py script for + # --build-dir. If not doing a docker build, --build-dir must be + # set. 
+ if FLAGS.no_container_build: + if FLAGS.build_dir is None: + fail("--no-container-build requires --build-dir") + if FLAGS.install_dir is None: + FLAGS.install_dir = os.path.join(FLAGS.build_dir, "opt", "tritonserver") + if FLAGS.cmake_dir is None: + FLAGS.cmake_dir = THIS_SCRIPT_DIR + else: + if FLAGS.build_dir is not None: + fail("--build-dir must not be set for container-based build") + if FLAGS.install_dir is not None: + fail("--install-dir must not be set for container-based build") + if FLAGS.cmake_dir is not None: + fail("--cmake-dir must not be set for container-based build") + FLAGS.build_dir = os.path.join(THIS_SCRIPT_DIR, "build") + + # Determine the versions. Start with Triton version, if --version + # is not explicitly specified read from TRITON_VERSION file. + if FLAGS.version is None: + with open(os.path.join(THIS_SCRIPT_DIR, "TRITON_VERSION"), "r") as vfile: + FLAGS.version = vfile.readline().strip() + + if FLAGS.build_parallel is None: + FLAGS.build_parallel = multiprocessing.cpu_count() * 2 + + log("Building Triton Inference Server") + log("platform {}".format(target_platform())) + log("machine {}".format(target_machine())) + log("version {}".format(FLAGS.version)) + log("build dir {}".format(FLAGS.build_dir)) + log("install dir {}".format(FLAGS.install_dir)) + log("cmake dir {}".format(FLAGS.cmake_dir)) + + # Determine the default repo-tag that should be used for images, + # backends, repo-agents, and caches if a repo-tag is not given + # explicitly. For release branches we use the release branch as + # the default, otherwise we use 'main'. + default_repo_tag = "main" + cver = FLAGS.container_version + if cver is None: + if FLAGS.version not in TRITON_VERSION_MAP: + fail( + "unable to determine default repo-tag, container version not known for {}".format( + FLAGS.version + ) + ) + cver = TRITON_VERSION_MAP[FLAGS.version][0] + if not cver.endswith("dev"): + default_repo_tag = "r" + cver + log("default repo-tag: {}".format(default_repo_tag)) + + # For other versions use the TRITON_VERSION_MAP unless explicitly + # given. + FLAGS.container_version, FLAGS.upstream_container_version = container_versions( + FLAGS.version, FLAGS.container_version, FLAGS.upstream_container_version + ) + + log("container version {}".format(FLAGS.container_version)) + log("upstream container version {}".format(FLAGS.upstream_container_version)) + + for ep in FLAGS.endpoint: + log(f'endpoint "{ep}"') + for fs in FLAGS.filesystem: + log(f'filesystem "{fs}"') + + # Initialize map of backends to build and repo-tag for each. + backends = {} + for be in FLAGS.backend: + parts = be.split(":") + if len(parts) == 1: + parts.append(default_repo_tag) + if parts[0] == "tensorflow1": + fail( + "Starting from Triton version 23.04, support for TensorFlow 1 has been discontinued. Please switch to Tensorflow 2." + ) + if parts[0] == "tensorflow2": + parts[0] = "tensorflow" + log('backend "{}" at tag/branch "{}"'.format(parts[0], parts[1])) + backends[parts[0]] = parts[1] + + if "vllm" in backends: + if "python" not in backends: + log( + "vLLM backend requires Python backend, adding Python backend with tag {}".format( + backends["vllm"] + ) + ) + backends["python"] = backends["vllm"] + + # Initialize map of repo agents to build and repo-tag for each. 
+ repoagents = {} + for be in FLAGS.repoagent: + parts = be.split(":") + if len(parts) == 1: + parts.append(default_repo_tag) + log('repoagent "{}" at tag/branch "{}"'.format(parts[0], parts[1])) + repoagents[parts[0]] = parts[1] + + # Initialize map of caches to build and repo-tag for each. + caches = {} + for be in FLAGS.cache: + parts = be.split(":") + if len(parts) == 1: + parts.append(default_repo_tag) + log('cache "{}" at tag/branch "{}"'.format(parts[0], parts[1])) + caches[parts[0]] = parts[1] + + # Initialize map of docker images. + images = {} + for img in FLAGS.image: + parts = img.split(",") + fail_if( + len(parts) != 2, "--image must specify <image-name>,<full-image-registry>" + ) + fail_if( + parts[0] + not in ["base", "gpu-base", "pytorch", "tensorflow", "tensorflow2"], + "unsupported value for --image", + ) + log('image "{}": "{}"'.format(parts[0], parts[1])) + if parts[0] == "tensorflow2": + parts[0] = "tensorflow" + images[parts[0]] = parts[1] + + # Initialize map of library paths for each backend. + library_paths = {} + for lpath in FLAGS.library_paths: + parts = lpath.split(":") + if len(parts) == 2: + log('backend "{}" library path "{}"'.format(parts[0], parts[1])) + if parts[0] == "tensorflow2": + parts[0] = "tensorflow" + library_paths[parts[0]] = parts[1] + + # Parse any explicitly specified cmake arguments + for cf in FLAGS.extra_core_cmake_arg: + parts = cf.split("=") + fail_if(len(parts) != 2, "--extra-core-cmake-arg must specify <name>=<value>") + log('CMake core extra "-D{}={}"'.format(parts[0], parts[1])) + EXTRA_CORE_CMAKE_FLAGS[parts[0]] = parts[1] + + for cf in FLAGS.override_core_cmake_arg: + parts = cf.split("=") + fail_if( + len(parts) != 2, "--override-core-cmake-arg must specify <name>=<value>" + ) + log('CMake core override "-D{}={}"'.format(parts[0], parts[1])) + OVERRIDE_CORE_CMAKE_FLAGS[parts[0]] = parts[1] + + for cf in FLAGS.extra_backend_cmake_arg: + parts = cf.split(":", 1) + fail_if( + len(parts) != 2, + "--extra-backend-cmake-arg must specify <backend>:<name>=<value>", + ) + be = parts[0] + parts = parts[1].split("=", 1) + fail_if( + len(parts) != 2, + "--extra-backend-cmake-arg must specify <backend>:<name>=<value>", + ) + fail_if( + be not in backends, + '--extra-backend-cmake-arg specifies backend "{}" which is not included in build'.format( + be + ), + ) + log('backend "{}" CMake extra "-D{}={}"'.format(be, parts[0], parts[1])) + if be not in EXTRA_BACKEND_CMAKE_FLAGS: + EXTRA_BACKEND_CMAKE_FLAGS[be] = {} + EXTRA_BACKEND_CMAKE_FLAGS[be][parts[0]] = parts[1] + + for cf in FLAGS.override_backend_cmake_arg: + parts = cf.split(":", 1) + fail_if( + len(parts) != 2, + "--override-backend-cmake-arg must specify <backend>:<name>=<value>", + ) + be = parts[0] + parts = parts[1].split("=", 1) + fail_if( + len(parts) != 2, + "--override-backend-cmake-arg must specify <backend>:<name>=<value>", + ) + fail_if( + be not in backends, + '--override-backend-cmake-arg specifies backend "{}" which is not included in build'.format( + be + ), + ) + log('backend "{}" CMake override "-D{}={}"'.format(be, parts[0], parts[1])) + if be not in OVERRIDE_BACKEND_CMAKE_FLAGS: + OVERRIDE_BACKEND_CMAKE_FLAGS[be] = {} + OVERRIDE_BACKEND_CMAKE_FLAGS[be][parts[0]] = parts[1] + + # Initialize map of common components and repo-tag for each. 
+ components = { + "common": default_repo_tag, + "core": default_repo_tag, + "backend": default_repo_tag, + "thirdparty": default_repo_tag, + } + for be in FLAGS.repo_tag: + parts = be.split(":") + fail_if(len(parts) != 2, "--repo-tag must specify <component-name>:<repo-tag>") + fail_if( + parts[0] not in components, + '--repo-tag <component-name> must be "common", "core", "backend", or "thirdparty"', + ) + components[parts[0]] = parts[1] + for c in components: + log('component "{}" at tag/branch "{}"'.format(c, components[c])) + + # Set the build, install, and cmake directories to use for the + # generated build scripts and Dockerfiles. If building without + # Docker, these are the directories specified on the cmdline. If + # building with Docker, we change these to be directories within + # FLAGS.tmp_dir inside the Docker container. + script_repo_dir = THIS_SCRIPT_DIR + script_build_dir = FLAGS.build_dir + script_install_dir = script_ci_dir = FLAGS.install_dir + script_cmake_dir = FLAGS.cmake_dir + if not FLAGS.no_container_build: + # FLAGS.tmp_dir may be specified with "\" on Windows, adjust + # to "/" for docker usage. + script_build_dir = os.path.normpath( + os.path.join(FLAGS.tmp_dir, "tritonbuild").replace("\\", "/") + ) + script_install_dir = os.path.normpath(os.path.join(script_build_dir, "install")) + script_ci_dir = os.path.normpath(os.path.join(script_build_dir, "ci")) + if target_platform() == "windows": + script_repo_dir = script_cmake_dir = os.path.normpath("c:/workspace") + else: + script_repo_dir = script_cmake_dir = "/workspace" + + script_name = "cmake_build" + if target_platform() == "windows": + script_name += ".ps1" + + # Write the build script that invokes cmake for the core, backends, repo-agents, and caches. + pathlib.Path(FLAGS.build_dir).mkdir(parents=True, exist_ok=True) + with BuildScript( + os.path.join(FLAGS.build_dir, script_name), + verbose=FLAGS.verbose, + desc=("Build script for Triton Inference Server"), + ) as cmake_script: + # Run the container pre-build command if the cmake build is + # being done within the build container. + if not FLAGS.no_container_build and FLAGS.container_prebuild_command: + cmake_script.cmd(FLAGS.container_prebuild_command, check_exitcode=True) + cmake_script.blankln() + + # Commands to build the core shared library and the server executable. + if not FLAGS.no_core_build: + core_build( + cmake_script, + script_repo_dir, + script_cmake_dir, + script_build_dir, + script_install_dir, + components, + backends, + ) + + # Commands to build each backend... + for be in backends: + # Core backends are not built separately from core so skip... + if be in CORE_BACKENDS: + continue + + # If armnn_tflite backend, source from external repo for git clone + if be == "armnn_tflite": + github_organization = "https://gitlab.com/arm-research/smarter/" + else: + github_organization = FLAGS.github_organization + + if be == "vllm": + backend_clone( + be, + cmake_script, + backends[be], + script_build_dir, + script_install_dir, + github_organization, + ) + else: + backend_build( + be, + cmake_script, + backends[be], + script_build_dir, + script_install_dir, + github_organization, + images, + components, + library_paths, + ) + + # Commands to build each repo agent... + for ra in repoagents: + repo_agent_build( + ra, + cmake_script, + script_build_dir, + script_install_dir, + repoagent_repo, + repoagents, + ) + + # Commands to build each cache... 
+ for cache in caches: + cache_build( + cache, + cmake_script, + script_build_dir, + script_install_dir, + cache_repo, + caches, + ) + + # Commands needed only when building with Docker... + if not FLAGS.no_container_build: + # Commands to collect all the build artifacts needed for CI + # testing. + cibase_build( + cmake_script, + script_repo_dir, + script_cmake_dir, + script_build_dir, + script_install_dir, + script_ci_dir, + backends, + ) + + # When building with Docker the install and ci artifacts + # written to the build-dir while running the docker container + # may have root ownership, so give them permissions to be + # managed by all users on the host system. + if target_platform() != "windows": + finalize_build(cmake_script, script_install_dir, script_ci_dir) + + # If --no-container-build is not specified then we perform the + # actual build within a docker container and from that create the + # final tritonserver docker image. For the build we need to + # generate a few Dockerfiles and a top-level script that drives + # the build process. + if not FLAGS.no_container_build: + script_name = "docker_build" + if target_platform() == "windows": + script_name += ".ps1" + + create_build_dockerfiles( + script_build_dir, images, backends, repoagents, caches, FLAGS.endpoint + ) + create_docker_build_script(script_name, script_install_dir, script_ci_dir) + + # In not dry-run, execute the script to perform the build... If a + # container-based build is requested use 'docker_build' script, + # otherwise build directly on this system using cmake script. + if not FLAGS.dryrun: + if target_platform() == "windows": + p = subprocess.Popen( + ["powershell.exe", "-noexit", "-File", f"./{script_name}"], + cwd=FLAGS.build_dir, + ) + else: + p = subprocess.Popen([f"./{script_name}"], cwd=FLAGS.build_dir) + p.wait() + fail_if(p.returncode != 0, "build failed") diff --git a/compose.py b/compose.py new file mode 100755 index 0000000000..14b58c93f6 --- /dev/null +++ b/compose.py @@ -0,0 +1,525 @@ +#!/usr/bin/env python3 +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import argparse +import os +import platform +import subprocess +import sys + +FLAGS = None + + +#### helper functions +def log(msg, force=False): + if force or not FLAGS.quiet: + try: + print(msg, file=sys.stderr) + except Exception: + print("<failed to log>", file=sys.stderr) + + +def log_verbose(msg): + if FLAGS.verbose: + log(msg, force=True) + + +def fail(msg): + print("error: {}".format(msg), file=sys.stderr) + sys.exit(1) + + +def fail_if(p, msg): + if p: + fail(msg) + + +def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): + # Set environment variables, set default user and install dependencies + df = """ +# +# Multistage build. +# +ARG TRITON_VERSION={} +ARG TRITON_CONTAINER_VERSION={} + +FROM {} AS full +""".format( + argmap["TRITON_VERSION"], argmap["TRITON_CONTAINER_VERSION"], images["full"] + ) + + # PyTorch, TensorFlow backends need extra CUDA and other + # dependencies during runtime that are missing in the CPU-only base container. + # These dependencies must be copied from the Triton Min image. + if not FLAGS.enable_gpu and ( + ("pytorch" in backends) + or ("tensorflow" in backends) + or ("tensorflow2" in backends) + ): + df += """ +FROM {} AS min_container + +""".format( + images["gpu-min"] + ) + + df += """ +FROM {} +""".format( + images["min"] + ) + + import build + + df += build.dockerfile_prepare_container_linux( + argmap, backends, FLAGS.enable_gpu, platform.machine().lower() + ) + # Copy over files + df += """ +WORKDIR /opt/tritonserver +COPY --chown=1000:1000 --from=full /opt/tritonserver/LICENSE . +COPY --chown=1000:1000 --from=full /opt/tritonserver/TRITON_VERSION . +COPY --chown=1000:1000 --from=full /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf . 
+COPY --chown=1000:1000 --from=full /opt/tritonserver/bin bin/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/ +COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/ +""" + with open(os.path.join(ddir, dockerfile_name), "w") as dfile: + dfile.write(df) + + +def add_requested_backends(ddir, dockerfile_name, backends): + df = "# Copying over backends \n" + for backend in backends: + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/{} /opt/tritonserver/backends/{} +""".format( + backend, backend + ) + if len(backends) > 0: + df += """ +# Top-level /opt/tritonserver/backends not copied so need to explicitly set permissions here +RUN chown triton-server:triton-server /opt/tritonserver/backends +""" + with open(os.path.join(ddir, dockerfile_name), "a") as dfile: + dfile.write(df) + + +def add_requested_repoagents(ddir, dockerfile_name, repoagents): + df = "# Copying over repoagents \n" + for ra in repoagents: + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{} +""".format( + ra, ra + ) + if len(repoagents) > 0: + df += """ +# Top-level /opt/tritonserver/repoagents not copied so need to explicitly set permissions here +RUN chown triton-server:triton-server /opt/tritonserver/repoagents +""" + with open(os.path.join(ddir, dockerfile_name), "a") as dfile: + dfile.write(df) + + +def add_requested_caches(ddir, dockerfile_name, caches): + df = "# Copying over caches \n" + for cache in caches: + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/caches/{} /opt/tritonserver/caches/{} +""".format( + cache, cache + ) + if len(caches) > 0: + df += """ +# Top-level /opt/tritonserver/caches not copied so need to explicitly set permissions here +RUN chown triton-server:triton-server /opt/tritonserver/caches +""" + with open(os.path.join(ddir, dockerfile_name), "a") as dfile: + dfile.write(df) + + +def end_dockerfile(ddir, dockerfile_name, argmap): + # Install additional dependencies + df = "" + if argmap["SAGEMAKER_ENDPOINT"]: + df += """ +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true +COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/. 
+""" + with open(os.path.join(ddir, dockerfile_name), "a") as dfile: + dfile.write(df) + + +def build_docker_image(ddir, dockerfile_name, container_name): + # Create container with docker build + p = subprocess.Popen( + [ + "docker", + "build", + "-t", + container_name, + "-f", + os.path.join(ddir, dockerfile_name), + ".", + ] + ) + p.wait() + fail_if(p.returncode != 0, "docker build {} failed".format(container_name)) + + +def get_container_version_if_not_specified(): + if FLAGS.container_version is None: + # Read from TRITON_VERSION file in server repo to determine version + with open("TRITON_VERSION", "r") as vfile: + version = vfile.readline().strip() + import build + + _, FLAGS.container_version = build.container_versions( + version, None, FLAGS.container_version + ) + log("version {}".format(version)) + log("using container version {}".format(FLAGS.container_version)) + + +def create_argmap(images, skip_pull): + # Extract information from upstream build and create map other functions can + # use + full_docker_image = images["full"] + min_docker_image = images["min"] + enable_gpu = FLAGS.enable_gpu + # Docker inspect environment variables + base_run_args = ["docker", "inspect", "-f"] + import re # parse all PATH environment variables + + # first pull docker images + if not skip_pull: + log("pulling container:{}".format(full_docker_image)) + p = subprocess.run(["docker", "pull", full_docker_image]) + fail_if( + p.returncode != 0, + "docker pull container {} failed, {}".format(full_docker_image, p.stderr), + ) + if enable_gpu: + if not skip_pull: + pm = subprocess.run(["docker", "pull", min_docker_image]) + fail_if( + pm.returncode != 0 and not skip_pull, + "docker pull container {} failed, {}".format( + min_docker_image, pm.stderr + ), + ) + pm_path = subprocess.run( + base_run_args + + [ + "{{range $index, $value := .Config.Env}}{{$value}} {{end}}", + min_docker_image, + ], + capture_output=True, + text=True, + ) + fail_if( + pm_path.returncode != 0, + "docker inspect to find triton environment variables for min container failed, {}".format( + pm_path.stderr + ), + ) + # min container needs to be GPU-support-enabled if the build is GPU build + vars = pm_path.stdout + e = re.search("CUDA_VERSION", vars) + gpu_enabled = False if e is None else True + fail_if( + not gpu_enabled, + "Composing container with gpu support enabled but min container provided does not have CUDA installed", + ) + + # Check full container environment variables + p_path = subprocess.run( + base_run_args + + [ + "{{range $index, $value := .Config.Env}}{{$value}} {{end}}", + full_docker_image, + ], + capture_output=True, + text=True, + ) + fail_if( + p_path.returncode != 0, + "docker inspect to find environment variables for full container failed, {}".format( + p_path.stderr + ), + ) + vars = p_path.stdout + log_verbose("inspect args: {}".format(vars)) + + e0 = re.search("TRITON_SERVER_GPU_ENABLED=([\S]{1,}) ", vars) + e1 = re.search("CUDA_VERSION", vars) + gpu_enabled = False + if e0 != None: + gpu_enabled = e0.group(1) == "1" + elif e1 != None: + gpu_enabled = True + fail_if( + gpu_enabled != enable_gpu, + "Error: full container provided was build with " + "'TRITON_SERVER_GPU_ENABLED' as {} and you are composing container" + "with 'TRITON_SERVER_GPU_ENABLED' as {}".format(gpu_enabled, enable_gpu), + ) + e = re.search("TRITON_SERVER_VERSION=([\S]{6,}) ", vars) + version = "" if e is None else e.group(1) + fail_if( + len(version) == 0, + "docker inspect to find triton server version failed, {}".format(p_path.stderr), 
+ ) + e = re.search("NVIDIA_TRITON_SERVER_VERSION=([\S]{5,}) ", vars) + container_version = "" if e is None else e.group(1) + fail_if( + len(container_version) == 0, + "docker inspect to find triton container version failed, {}".format(vars), + ) + dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars) + dcgm_version = "" + if dcgm_ver is None: + dcgm_version = "2.2.3" + log( + "WARNING: DCGM version not found from image, installing the earlierst version {}".format( + dcgm_version + ) + ) + else: + dcgm_version = dcgm_ver.group(1) + fail_if( + len(dcgm_version) == 0, + "docker inspect to find DCGM version failed, {}".format(vars), + ) + + p_sha = subprocess.run( + base_run_args + + ['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image], + capture_output=True, + text=True, + ) + fail_if( + p_sha.returncode != 0, + "docker inspect of upstream docker image build sha failed, {}".format( + p_sha.stderr + ), + ) + p_build = subprocess.run( + base_run_args + + ['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image], + capture_output=True, + text=True, + ) + fail_if( + p_build.returncode != 0, + "docker inspect of upstream docker image build sha failed, {}".format( + p_build.stderr + ), + ) + + p_find = subprocess.run( + ["docker", "run", full_docker_image, "bash", "-c", "ls /usr/bin/"], + capture_output=True, + text=True, + ) + f = re.search("serve", p_find.stdout) + fail_if( + p_find.returncode != 0, + "Cannot search for 'serve' in /usr/bin, {}".format(p_find.stderr), + ) + argmap = { + "NVIDIA_BUILD_REF": p_sha.stdout.rstrip(), + "NVIDIA_BUILD_ID": p_build.stdout.rstrip(), + "TRITON_VERSION": version, + "TRITON_CONTAINER_VERSION": container_version, + "DCGM_VERSION": dcgm_version, + "SAGEMAKER_ENDPOINT": f is not None, + } + return argmap + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + group_qv = parser.add_mutually_exclusive_group() + group_qv.add_argument( + "-q", + "--quiet", + action="store_true", + required=False, + help="Disable console output.", + ) + group_qv.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + help="Enable verbose output.", + ) + parser.add_argument( + "--output-name", + type=str, + required=False, + help='Name for the generated Docker image. Default is "tritonserver".', + ) + parser.add_argument( + "--work-dir", + type=str, + required=False, + help="Generated dockerfiles are placed here. Default to current directory.", + ) + parser.add_argument( + "--container-version", + type=str, + required=False, + help="The version to use for the generated Docker image. If not specified " + "the container version will be chosen automatically based on the " + "repository branch.", + ) + parser.add_argument( + "--image", + action="append", + required=False, + help="Use specified Docker image to generate Docker image. Specified as " + '<image-name>,<full-image-name>. <image-name> can be "min", "gpu-min" ' + 'or "full". Both "min" and "full" need to be specified at the same time.' + 'This will override "--container-version". "gpu-min" is needed for ' + "CPU-only container to copy TensorFlow and PyTorch deps.", + ) + parser.add_argument( + "--enable-gpu", + nargs="?", + type=lambda x: (str(x).lower() == "true"), + const=True, + default=True, + required=False, + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--backend", + action="append", + required=False, + help="Include <backend-name> in the generated Docker image. 
The flag may be " + "specified multiple times.", + ) + parser.add_argument( + "--repoagent", + action="append", + required=False, + help="Include <repoagent-name> in the generated Docker image. The flag may " + "be specified multiple times.", + ) + parser.add_argument( + "--cache", + action="append", + required=False, + help="Include <cache-name> in the generated Docker image. The flag may " + "be specified multiple times.", + ) + parser.add_argument( + "--skip-pull", + action="store_true", + required=False, + help="Do not pull the required docker images. The user is responsible " + "for pulling the upstream images needed to compose the image.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + required=False, + help="Only creates Dockerfile.compose, does not build the Docker image.", + ) + + FLAGS = parser.parse_args() + + if FLAGS.work_dir is None: + FLAGS.work_dir = "." + if FLAGS.output_name is None: + FLAGS.output_name = "tritonserver" + + dockerfile_name = "Dockerfile.compose" + + if FLAGS.backend is None: + FLAGS.backend = [] + if FLAGS.repoagent is None: + FLAGS.repoagent = [] + if FLAGS.cache is None: + FLAGS.cache = [] + + # Initialize map of docker images. + images = {} + if FLAGS.image: + for img in FLAGS.image: + parts = img.split(",") + fail_if( + len(parts) != 2, + "--image must specific <image-name>,<full-image-registry>", + ) + fail_if( + parts[0] not in ["min", "full", "gpu-min"], + "unsupported image-name '{}' for --image".format(parts[0]), + ) + log('image "{}": "{}"'.format(parts[0], parts[1])) + images[parts[0]] = parts[1] + else: + get_container_version_if_not_specified() + if FLAGS.enable_gpu: + images = { + "full": "nvcr.io/nvidia/tritonserver:{}-py3".format( + FLAGS.container_version + ), + "min": "nvcr.io/nvidia/tritonserver:{}-py3-min".format( + FLAGS.container_version + ), + } + else: + images = { + "full": "nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format( + FLAGS.container_version + ), + "min": "ubuntu:22.04", + } + fail_if(len(images) < 2, "Need to specify both 'full' and 'min' images if at all") + + # For CPU-only image we need to copy some cuda libraries and dependencies + # since we are using PyTorch, TensorFlow 1, TensorFlow 2 containers that + # are not CPU-only. + if ( + ("pytorch" in FLAGS.backend) + or ("tensorflow" in FLAGS.backend) + or ("tensorflow2" in FLAGS.backend) + ) and ("gpu-min" not in images): + images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format( + FLAGS.container_version + ) + + argmap = create_argmap(images, FLAGS.skip_pull) + + start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name, FLAGS.backend) + add_requested_backends(FLAGS.work_dir, dockerfile_name, FLAGS.backend) + add_requested_repoagents(FLAGS.work_dir, dockerfile_name, FLAGS.repoagent) + add_requested_caches(FLAGS.work_dir, dockerfile_name, FLAGS.cache) + end_dockerfile(FLAGS.work_dir, dockerfile_name, argmap) + + if not FLAGS.dry_run: + build_docker_image(FLAGS.work_dir, dockerfile_name, FLAGS.output_name) diff --git a/deploy/alibaba-cloud/README.md b/deploy/alibaba-cloud/README.md new file mode 100644 index 0000000000..98f914a693 --- /dev/null +++ b/deploy/alibaba-cloud/README.md @@ -0,0 +1,180 @@ +<!-- +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+# Deploy Triton Inference Server on PAI-EAS
+* Table Of Contents
+   - [Description](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Description)
+   - [Prerequisites](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Prerequisites)
+   - [Demo Instruction](https://yuque.alibaba-inc.com/pai/blade/mtptqc#31bb94ef)
+   - [Additional Resources](https://yuque.alibaba-inc.com/pai/blade/mtptqc#89d5e680)
+   - [Known Issues](https://yuque.alibaba-inc.com/pai/blade/mtptqc#558ab0be)
+
+# Description
+This repository contains information about how to deploy the NVIDIA Triton Inference Server in EAS (Elastic Algorithm Service) on Alibaba Cloud.
+- EAS provides a simple way for deep learning developers to deploy their models in Alibaba Cloud.
+- Using the **Triton Processor** is the recommended way to deploy Triton Inference Server on EAS. Users simply prepare their models and create an EAS service with the processor type set to `triton`.
+- Models should be uploaded to Alibaba Cloud's OSS (Object Storage Service). The user's model repository in OSS is mounted onto a local path visible to Triton Server.
+- This documentation uses Triton's own example models for the demo. The TensorFlow inception model can be downloaded by the `fetch_models.sh` script.
+
+# Prerequisites
+- You should register an Alibaba Cloud account and be able to use EAS via [eascmd](https://help.aliyun.com/document_detail/111031.html?spm=a2c4g.11186623.6.752.42356f46FN5fU1), a command-line tool to create, stop, or scale services on EAS.
+- Before creating an EAS service, you should buy dedicated resource groups (CPU or GPU) on EAS following this [document](https://www.alibabacloud.com/help/doc-detail/120122.htm).
+- Make sure you can use OSS (Object Storage Service); the models should be uploaded into your own OSS bucket.
+
+# Demo Instruction
+## Prepare a model repo directory in OSS
+Download the TensorFlow inception model via [fetch_models.sh](https://github.com/triton-inference-server/server/blob/main/docs/examples/fetch_models.sh).
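+For example, assuming a local checkout of the server repository, the script can be run from the `docs/examples` directory (a minimal sketch; adjust the paths to your environment):
+
+```
+$ git clone https://github.com/triton-inference-server/server.git
+$ cd server/docs/examples
+$ ./fetch_models.sh
+```
+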
+Then use [ossutil](https://help.aliyun.com/document_detail/50452.html?spm=a2c4g.11186623.6.833.26d66d51dPEytI), a command-line tool for OSS, to upload the model to the OSS directory of your choice.
+
+```
+./ossutil cp inception_graphdef/ oss://triton-model-repo/models
+```
+## Create a Triton service with a JSON config by eascmd
+The following is the JSON config we use when creating a Triton server on EAS.
+```
+{
+    "name": "<your triton service name>",
+    "processor": "triton",
+    "processor_params": [
+        "--model-repository=oss://triton-model-repo/models",
+        "--allow-grpc=true",
+        "--allow-http=true"
+    ],
+    "metadata": {
+        "instance": 1,
+        "cpu": 4,
+        "gpu": 1,
+        "memory": 10000,
+        "resource": "<your resource id>",
+        "rpc.keepalive": 3000
+    }
+}
+```
+Only `processor` and `processor_params` should differ from a normal EAS service.
+|params|details|
+|--------|-------|
+|processor|Name should be **triton** to use Triton on EAS|
+|processor_params|List of strings; every element is a parameter passed to tritonserver|
+
+```
+./eascmd create triton.config
+[RequestId]: AECDB6A4-CB69-4688-AA35-BA1E020C39E6
++-------------------+------------------------------------------------------------------------------------------------+
+| Internet Endpoint | http://1271520832287160.cn-shanghai.pai-eas.aliyuncs.com/api/predict/test_triton_processor     |
+| Intranet Endpoint | http://1271520832287160.vpc.cn-shanghai.pai-eas.aliyuncs.com/api/predict/test_triton_processor |
+| Token             | MmY3M2ExZGYwYjZiMTQ5YTRmZWE3MDAzNWM1ZTBiOWQ3MGYxZGNkZQ==                                       |
++-------------------+------------------------------------------------------------------------------------------------+
+[OK] Service is now deploying
+[OK] Successfully synchronized resources
+[OK] Waiting [Total: 1, Pending: 1, Running: 0]
+[OK] Waiting [Total: 1, Pending: 1, Running: 0]
+[OK] Running [Total: 1, Pending: 0, Running: 1]
+[OK] Service is running
+```
+## Query the Triton service with the Python client
+### Install Triton's Python client
+```
+pip install tritonclient[all]
+```
+### A demo to query the inception model
+```
+import numpy as np
+import time
+from PIL import Image
+
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException
+
+URL = "<service url>"
+HEADERS = {"Authorization": "<service token>"}
+input_img = httpclient.InferInput("input", [1, 299, 299, 3], "FP32")
+# Use one of the cat images from ImageNet or any cat image you like
+img = Image.open('./cat.png').resize((299, 299))
+img = np.asarray(img).astype('float32') / 255.0
+input_img.set_data_from_numpy(img.reshape([1, 299, 299, 3]), binary_data=True)
+
+output = httpclient.InferRequestedOutput(
+    "InceptionV3/Predictions/Softmax", binary_data=True
+)
+triton_client = httpclient.InferenceServerClient(url=URL, verbose=False)
+
+start = time.time()
+for i in range(10):
+    results = triton_client.infer(
+        "inception_graphdef", inputs=[input_img], outputs=[output], headers=HEADERS
+    )
+    res_body = results.get_response()
+    elapsed_ms = (time.time() - start) * 1000
+    if i == 0:
+        print("model name: ", res_body["model_name"])
+        print("model version: ", res_body["model_version"])
+        print("output name: ", res_body["outputs"][0]["name"])
+        print("output shape: ", res_body["outputs"][0]["shape"])
+    print("[{}] Avg rt(ms): {:.2f}".format(i, elapsed_ms))
+    start = time.time()
+```
+Running the Python script produces output similar to the following:
+```
+[0] Avg rt(ms): 86.05
+[1] Avg rt(ms): 52.35
+[2] Avg rt(ms): 50.56
+[3] Avg rt(ms): 43.45
+[4] Avg rt(ms): 41.19
+[5] Avg rt(ms): 40.55
+[6] Avg rt(ms): 37.24
+[7] Avg rt(ms): 37.16
+[8] Avg rt(ms): 36.68
+[9] Avg rt(ms): 34.24
+[10] Avg rt(ms): 34.27
+```
+# Additional Resources
+See the following resources to learn more about how to use Alibaba Cloud's OSS or EAS.
+- [Alibaba Cloud OSS documentation](https://help.aliyun.com/product/31815.html?spm=a2c4g.11186623.6.540.3c0f62e7q3jw8b)
+
+
+# Known Issues
+- The [Binary Tensor Data Extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md) is not fully supported yet. For users who want a service with the binary extension enabled, it is currently available only in the cn-shanghai region of PAI-EAS.
+- Currently only HTTP/1 is supported, so gRPC cannot be used when querying Triton servers on EAS. HTTP/2 will be officially supported in a short time.
+- Users should not mount a whole OSS bucket when launching the Triton processor, but rather an arbitrarily deep sub-directory in the bucket. Otherwise the mounted path will not be as expected.
+- Not all Triton Server parameters are supported on EAS; the following parameters are supported:
+```
+model-repository
+log-verbose
+log-info
+log-warning
+log-error
+exit-on-error
+strict-model-config
+strict-readiness
+allow-http
+http-thread-count
+pinned-memory-pool-byte-size
+cuda-memory-pool-byte-size
+min-supported-compute-capability
+buffer-manager-thread-count
+backend-config
+```
diff --git a/deploy/aws/Chart.yaml b/deploy/aws/Chart.yaml
new file mode 100644
index 0000000000..2b7541bee6
--- /dev/null
+++ b/deploy/aws/Chart.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+appVersion: "1.0"
+description: Triton Inference Server
+name: triton-inference-server
+version: 1.0.0
diff --git a/deploy/aws/README.md b/deploy/aws/README.md
new file mode 100644
index 0000000000..4e60fdd65b
--- /dev/null
+++ b/deploy/aws/README.md
@@ -0,0 +1,262 @@
+<!--
+# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# Kubernetes Deploy: Triton Inference Server Cluster
+
+A helm chart for installing a single cluster of Triton Inference
+Server is provided. By default the cluster contains a single instance
+of the inference server but the *replicaCount* configuration parameter
+can be set to create a cluster of any size, as described below.
+
+This guide assumes you already have a functional Kubernetes cluster
+and helm installed (see below for instructions on installing
+helm). Note the following requirements:
+
+* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prometheus and Grafana in your cluster as described below, and your cluster must contain sufficient CPU resources to support these services.
+
+* If you want Triton Server to use GPUs for inferencing, your cluster
+must be configured to contain the desired number of GPU nodes (EC2 G4 instances recommended)
+with support for the NVIDIA driver and CUDA version required by the version
+of the inference server you are using.
+
+The steps below describe how to set up a model repository, use helm to
+launch the inference server, and then send inference requests to the
+running server. You can access a Grafana endpoint to see real-time
+metrics reported by the inference server.
+
+## Installing Helm
+
+### Helm v3
+
+If you do not already have Helm installed in your Kubernetes cluster,
+executing the following steps from the [official helm install
+guide](https://helm.sh/docs/intro/install/) will
+give you a quick setup.
+
+If you're currently using Helm v2 and would like to migrate to Helm v3,
+please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/).
+
+### Helm v2
+
+> **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3.
+
+Below are example instructions for installing Helm v2.
+ +``` +$ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash +$ kubectl create serviceaccount -n kube-system tiller +serviceaccount/tiller created +$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller +$ helm init --service-account tiller --wait +``` + +If you run into any issues, you can refer to the official installation guide [here](https://v2.helm.sh/docs/install/). + +## Model Repository + +If you already have a model repository you may use that with this helm +chart. If you do not have a model repository, you can check out a local +copy of the inference server source repository to create an example +model repository: + +``` +$ git clone https://github.com/triton-inference-server/server.git +``` + +Triton Server needs a repository of models that it will make available +for inferencing. For this example you will place the model repository +in an AWS S3 Storage bucket. + +``` +$ aws s3 mb s3://triton-inference-server-repository +``` + +Following the [QuickStart](../../docs/getting_started/quickstart.md), download the +example model repository to your system and copy it into the AWS S3 +bucket. + +``` +$ aws s3 cp --recursive docs/examples/model_repository s3://triton-inference-server-repository/model_repository +``` + +### AWS Model Repository +To load the model from AWS S3, you need to convert the following AWS credentials to base64 format and add them to the values.yaml file. + +``` +echo -n 'REGION' | base64 +``` +``` +echo -n 'SECRET_KEY_ID' | base64 +``` +``` +echo -n 'SECRET_ACCESS_KEY' | base64 +``` + +## Deploy Prometheus and Grafana + +The inference server metrics are collected by Prometheus and viewable +by Grafana. The inference server helm chart assumes that Prometheus +and Grafana are available so this step must be followed even if you +don't want to use Grafana. + +Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install these components. The +*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that +Prometheus can find the inference server metrics in the *example* +release deployed below. + +``` +$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack +``` + +Then port-forward to the Grafana service so you can access it from +your local browser. + +``` +$ kubectl port-forward service/example-metrics-grafana 8080:80 +``` + +Now you should be able to navigate in your browser to localhost:8080 +and see the Grafana login page. Use username=admin and +password=prom-operator to log in. + +An example Grafana dashboard is available in dashboard.json. Use the +import function in Grafana to import and view this dashboard. + +## Deploy the Inference Server + +Deploy the inference server using the default configuration with the +following commands. + +``` +$ cd <directory containing Chart.yaml> +$ helm install example . +``` + +Use kubectl to see status and wait until the inference server pods are +running. + +``` +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s +``` + +There are several ways of overriding the default configuration as +described in this [helm +documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing).
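For this AWS chart, the overrides you will most often need are the model repository path and the base64-encoded credentials from the *AWS Model Repository* section above. Below is a minimal sketch of such an override file; the keys mirror this chart's values.yaml, while the file name and the values shown are illustrative placeholders, not part of this repository.

```
# example-overrides.yaml (illustrative file name)
image:
  modelRepositoryPath: s3://triton-inference-server-repository/model_repository

secret:
  # base64-encoded strings, e.g. produced by: echo -n 'us-west-2' | base64
  region: <base64 of REGION>
  id: <base64 of SECRET_KEY_ID>
  key: <base64 of SECRET_ACCESS_KEY>
```

A file like this can be passed to helm with the *-f* option (for example, `helm install example -f example-overrides.yaml .`), or the same keys can be changed directly in values.yaml, as described next.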
+ +You can edit the values.yaml file directly or you can use the *--set* +option to override a single parameter with the CLI. For example, to +deploy a cluster of four inference servers, use *--set* to set the +replicaCount parameter. + +``` +$ helm install example --set replicaCount=4 . +``` + +You can also write your own "config.yaml" file with the values you +want to override and pass it to helm. + +``` +$ cat << EOF > config.yaml +namespace: MyCustomNamespace +image: + imageName: nvcr.io/nvidia/tritonserver:custom-tag + modelRepositoryPath: s3://my_model_repository +EOF +$ helm install example -f config.yaml . +``` + +## Using Triton Inference Server + +Now that the inference server is running you can send HTTP or GRPC +requests to it to perform inferencing. By default, the inferencing +service is exposed with a LoadBalancer service type. Use the following +to find the external IP for the inference server. In this case it is +34.83.9.133. + +``` +$ kubectl get services +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +... +example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m +``` + +The inference server exposes an HTTP endpoint on port 8000, a GRPC +endpoint on port 8001, and a Prometheus metrics endpoint on +port 8002. You can use curl to get the metadata of the inference server +from the HTTP endpoint. + +``` +$ curl 34.83.9.133:8000/v2 +``` + +Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example +image classification client that can be used to perform inferencing +using image classification models being served by the inference +server. For example, + +``` +$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg +Request 0, batch size 1 +Image 'images/mug.jpg': + 504 (COFFEE MUG) = 0.723992 + 968 (CUP) = 0.270953 + 967 (ESPRESSO) = 0.00115997 +``` + +## Cleanup + +Once you've finished using the inference server you should use helm to +delete the deployment. + +``` +$ helm list +NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE +example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default +example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default + +$ helm uninstall example +$ helm uninstall example-metrics +``` + +For the Prometheus and Grafana services, you should [explicitly delete +CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart): + +``` +$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com +``` + +You may also want to delete the AWS bucket you created to hold the +model repository.
+ +``` +$ aws s3 rm -r gs://triton-inference-server-repository +``` diff --git a/deploy/aws/dashboard.json b/deploy/aws/dashboard.json new file mode 100644 index 0000000000..8960b41d35 --- /dev/null +++ b/deploy/aws/dashboard.json @@ -0,0 +1,411 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.3.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nv_inference_request_success", + "legendFormat": "Success {{instance}}", + "refId": "A" + }, + { + "expr": "nv_inference_request_failure", + "legendFormat": "Failure {{instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cumulative Inference Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateReds", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 7, + "legend": { + "show": false + }, + "options": {}, + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Load Ratio (Total Time / Compute Time)", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "short", + "logBase": 1, 
+ "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Queue Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Compute Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Compute Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 19, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Triton Inference Server", + "uid": "slEY4dsZk", + "version": 8 +} diff --git a/deploy/aws/templates/_helpers.tpl b/deploy/aws/templates/_helpers.tpl new file mode 100644 index 0000000000..6dba910012 --- /dev/null +++ b/deploy/aws/templates/_helpers.tpl @@ 
-0,0 +1,92 @@ +{{/* +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/}} + +{{/* vim: set filetype=mustache: */}} +{{/* +Create inference server name. +*/}} +{{- define "triton-inference-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "triton-inference-server.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* + Create inference server metrics service name and fullname derived from above and + truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{/* + Create inference server metrics monitor name and fullname derived from + above and truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics-monitor.name" -}} +{{- $basename := include "triton-inference-server.name" . 
-}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics-monitor.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "triton-inference-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/aws/templates/deployment.yaml b/deploy/aws/templates/deployment.yaml new file mode 100644 index 0000000000..d90e51b113 --- /dev/null +++ b/deploy/aws/templates/deployment.yaml @@ -0,0 +1,100 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ template "triton-inference-server.name" . 
}} + release: {{ .Release.Name }} + + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.imageName }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + + resources: + limits: + nvidia.com/gpu: {{ .Values.image.numGpus }} + + args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}", + "--model-control-mode=poll", + "--repository-poll-secs=5"] + + env: + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_DEFAULT_REGION + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SECRET_ACCESS_KEY + + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + livenessProbe: + httpGet: + path: /v2/health/live + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + httpGet: + path: /v2/health/ready + port: http + + securityContext: + runAsUser: 1000 + fsGroup: 1000 diff --git a/deploy/aws/templates/secrets.yaml b/deploy/aws/templates/secrets.yaml new file mode 100644 index 0000000000..d113214ee0 --- /dev/null +++ b/deploy/aws/templates/secrets.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Secret +metadata: + name: aws-credentials +type: Opaque +data: + AWS_DEFAULT_REGION: {{ .Values.secret.region }} + AWS_ACCESS_KEY_ID: {{ .Values.secret.id }} + AWS_SECRET_ACCESS_KEY: {{ .Values.secret.key }} diff --git a/deploy/aws/templates/service.yaml b/deploy/aws/templates/service.yaml new file mode 100644 index 0000000000..3315fd77db --- /dev/null +++ b/deploy/aws/templates/service.yaml @@ -0,0 +1,91 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 8000 + targetPort: http + name: http-inference-server + - port: 8001 + targetPort: grpc + name: grpc-inference-server + - port: 8002 + targetPort: metrics + name: metrics-inference-server + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server-metrics.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" +spec: + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics-monitor.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + selector: + matchLabels: + app: {{ template "triton-inference-server-metrics.name" . }} + endpoints: + - port: metrics + interval: 15s diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml new file mode 100644 index 0000000000..bd8ae0fe3b --- /dev/null +++ b/deploy/aws/values.yaml @@ -0,0 +1,41 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +replicaCount: 1 + +image: + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 + pullPolicy: IfNotPresent + modelRepositoryPath: s3://triton-inference-server-repository/model_repository + numGpus: 1 + +service: + type: LoadBalancer + +secret: + region: AWS_REGION + id: AWS_SECRET_KEY_ID + key: AWS_SECRET_ACCESS_KEY diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml new file mode 100644 index 0000000000..8feee92b3c --- /dev/null +++ b/deploy/fleetcommand/Chart.yaml @@ -0,0 +1,38 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +apiVersion: v1 +# appVersion is the Triton version; update when changing release +appVersion: "2.50.0" +description: Triton Inference Server (Fleet Command) +name: triton-inference-server +# version is the Chart version; update when changing anything in the chart +# This follows semantic versioning, i.e.: +# Given version X.Y.Z +# When making fixes to the chart, increment Z +# When making functional changes to the chart (including updating the Triton version, above), increment Y and reset Z to 0 +# When making breaking changes to the chart (e.g. user must take action before deploying), increment X and reset Y and Z to 0 +version: 1.4.0 diff --git a/deploy/fleetcommand/README.md b/deploy/fleetcommand/README.md new file mode 100644 index 0000000000..217162279c --- /dev/null +++ b/deploy/fleetcommand/README.md @@ -0,0 +1,150 @@ +<!-- +# Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) + +# Fleet Command Deploy: NVIDIA Triton Inference Server + +A helm chart for installing a single cluster of NVIDIA Triton Inference Server +on Fleet Command is provided. By default the cluster contains a single instance +of the Triton but the *replicaCount* configuration parameter can be set to +create a cluster of any size, as described below. + +This guide assumes you already have a functional Fleet Command location +deployed. Please refer to the [Fleet Command +Documentation](https://docs.nvidia.com/fleet-command/prod_fleet-command/prod_fleet-command/overview.html) + +The steps below describe how to set-up a model repository, use helm to launch +the Triton, and then send inference requests to the running Triton Inference +Server. You can optionally scrape metrics with Prometheus and access a Grafana +endpoint to see real-time metrics reported by Triton. + +## Model Repository + +If you already have a model repository you may use that with this helm chart. 
If +you do not have a model repository, you can check out a local copy of the Triton +Inference Server source repository to create an example model repository: + +``` +$ git clone https://github.com/triton-inference-server/server.git +``` + +Triton needs a repository of models that it will make available for inferencing. +For this example you will place the model repository in an S3 Storage bucket +(either in AWS or other S3 API compatible on-premises object storage). + +``` +$ aws s3 mb s3://triton-inference-server-repository +``` + +Following the [QuickStart](../../docs/getting_started/quickstart.md), download the example model +repository to your system and copy it into the AWS S3 bucket. + +``` +$ aws s3 cp --recursive docs/examples/model_repository s3://triton-inference-server-repository/model_repository +``` + +### AWS Model Repository + +To load the model from AWS S3, you need to convert the following AWS +credentials to base64 format and add them to the Application Configuration +section when creating the Fleet Command Deployment. + +``` +echo -n 'REGION' | base64 +echo -n 'SECRET_KEY_ID' | base64 +echo -n 'SECRET_ACCESS_KEY' | base64 +# Optional for using session token +echo -n 'AWS_SESSION_TOKEN' | base64 +``` + +## Deploy the Triton Inference Server + +Deploy the Triton Inference Server to your Location in Fleet Command by creating +a Deployment. You can specify configuration parameters to override the default +[values.yaml](values.yaml) in the Application Configuration section. + +*Note:* You _must_ provide a `--model-repository` parameter with a path to your +prepared model repository in your S3 bucket. Otherwise, Triton will not +start. + +An example Application Configuration for Triton on Fleet Command: +```yaml +image: + serverArgs: + - --model-repository=s3://triton-inference-server-repository + +secret: + region: <region in base 64 > + id: <access id in base 64 > + key: <access key in base 64> + token: <session token in base 64 (optional)> +``` + +See [Fleet Command documentation](https://docs.nvidia.com/fleet-command/prod_fleet-command/prod_fleet-command/ug-deploying-to-the-edge.html) +for more info. + +### Prometheus ServiceMonitor Support + +If you have `prometheus-operator` deployed, you can enable the ServiceMonitor +for the Triton Inference Server by setting `serviceMonitor.enabled: true` in +Application Configuration. This will also deploy a Grafana dashboard for Triton +as a ConfigMap. + +Otherwise, metrics can be scraped by pointing an external Prometheus +instance at the `metricsNodePort` in the values. + +## Using Triton Inference Server + +Now that the Triton Inference Server is running you can send HTTP or GRPC +requests to it to perform inferencing. By default, the service is exposed with a +NodePort service type, where the same port is opened on all systems in a +Location. + +Triton exposes an HTTP endpoint on port 30343, a GRPC endpoint on port 30344, +and a Prometheus metrics endpoint on port 30345. These ports can be overridden +in the application configuration when deploying. You can use curl to get the +metadata of Triton from the HTTP endpoint. For example, if a system in your +location has the IP `34.83.9.133`: + +``` +$ curl 34.83.9.133:30343/v2 +``` + +Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example image +classification client that can be used to perform inferencing using image +classification models being served by Triton.
For example, + +``` +$ image_client -u 34.83.9.133:30343 -m densenet_onnx -s INCEPTION -c 3 mug.jpg +Request 0, batch size 1 +Image '/workspace/images/mug.jpg': + 15.349568 (504) = COFFEE MUG + 13.227468 (968) = CUP + 10.424893 (505) = COFFEEPOT +``` diff --git a/deploy/fleetcommand/dashboard.json b/deploy/fleetcommand/dashboard.json new file mode 100644 index 0000000000..5868176cbe --- /dev/null +++ b/deploy/fleetcommand/dashboard.json @@ -0,0 +1,419 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.3.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nv_inference_request_success", + "legendFormat": "Success {{instance}}", + "refId": "A" + }, + { + "expr": "nv_inference_request_failure", + "legendFormat": "Failure {{instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cumulative Inference Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateReds", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 7, + "legend": { + "show": false + }, + "options": {}, + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Load Ratio (Total Time / Compute Time)", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": 
"short", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Queue Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Compute Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Compute Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 19, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + 
"2h", + "1d" + ] + }, + "timezone": "", + "title": "Triton Inference Server", + "uid": "slEY4dsZk", + "version": 8 +} diff --git a/deploy/fleetcommand/templates/_helpers.tpl b/deploy/fleetcommand/templates/_helpers.tpl new file mode 100644 index 0000000000..6dba910012 --- /dev/null +++ b/deploy/fleetcommand/templates/_helpers.tpl @@ -0,0 +1,92 @@ +{{/* +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/}} + +{{/* vim: set filetype=mustache: */}} +{{/* +Create inference server name. +*/}} +{{- define "triton-inference-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "triton-inference-server.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* + Create inference server metrics service name and fullname derived from above and + truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{/* + Create inference server metrics monitor name and fullname derived from + above and truncated appropriately. 
+*/}} +{{- define "triton-inference-server-metrics-monitor.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics-monitor.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "triton-inference-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/fleetcommand/templates/configmap-grafana-dashboard.yaml b/deploy/fleetcommand/templates/configmap-grafana-dashboard.yaml new file mode 100644 index 0000000000..782b1f85e6 --- /dev/null +++ b/deploy/fleetcommand/templates/configmap-grafana-dashboard.yaml @@ -0,0 +1,37 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +{{- if .Values.serviceMonitor.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-dashboard-configmap + labels: + grafana_dashboard: "1" +data: + dashboard.json: |- +{{ .Files.Get "dashboard.json" | indent 4}} +{{- end }} diff --git a/deploy/fleetcommand/templates/deployment.yaml b/deploy/fleetcommand/templates/deployment.yaml new file mode 100644 index 0000000000..5d7af7023d --- /dev/null +++ b/deploy/fleetcommand/templates/deployment.yaml @@ -0,0 +1,112 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.imageName }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + + resources: + limits: + nvidia.com/gpu: {{ .Values.image.numGpus }} + + args: + - {{ .Values.image.serverCommand }} + {{- $args := required "image.serverArgs, at least --model-repository, is required!" .Values.image.serverArgs }} + {{- range $args }} + - {{ . -}} + {{ end }} + +{{ if .Values.secret }} + env: + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_DEFAULT_REGION + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SECRET_ACCESS_KEY +{{- if .Values.secret.token }} + - name: AWS_SESSION_TOKEN + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SESSION_TOKEN +{{- end }} +{{- end }} + + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + livenessProbe: + httpGet: + path: /v2/health/live + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + httpGet: + path: /v2/health/ready + port: http + + securityContext: + runAsUser: 1000 + fsGroup: 1000 diff --git a/deploy/fleetcommand/templates/secrets.yaml b/deploy/fleetcommand/templates/secrets.yaml new file mode 100644 index 0000000000..9c7dcd404d --- /dev/null +++ b/deploy/fleetcommand/templates/secrets.yaml @@ -0,0 +1,40 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +{{- if .Values.secret }} +apiVersion: v1 +kind: Secret +metadata: + name: aws-credentials +type: Opaque +data: + AWS_DEFAULT_REGION: {{ .Values.secret.region }} + AWS_ACCESS_KEY_ID: {{ .Values.secret.id }} + AWS_SECRET_ACCESS_KEY: {{ .Values.secret.key }} +{{- if .Values.secret.token }} + AWS_SESSION_TOKEN: {{ .Values.secret.token }} +{{- end }} +{{- end }} diff --git a/deploy/fleetcommand/templates/service.yaml b/deploy/fleetcommand/templates/service.yaml new file mode 100644 index 0000000000..4f12205902 --- /dev/null +++ b/deploy/fleetcommand/templates/service.yaml @@ -0,0 +1,102 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 8000 + targetPort: http + name: http-inference-server + {{- if .Values.service.httpNodePort }} + nodePort: {{ .Values.service.httpNodePort }} + {{- end }} + - port: 8001 + targetPort: grpc + name: grpc-inference-server + {{- if .Values.service.grpcNodePort }} + nodePort: {{ .Values.service.grpcNodePort }} + {{- end }} + - port: 8002 + targetPort: metrics + name: metrics-inference-server + {{- if .Values.service.metricsNodePort }} + nodePort: {{ .Values.service.metricsNodePort }} + {{- end }} + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server-metrics.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" +spec: + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +{{- if .Values.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics-monitor.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + selector: + matchLabels: + app: {{ template "triton-inference-server-metrics.name" . }} + endpoints: + - port: metrics + interval: 15s +{{- end }} diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml new file mode 100644 index 0000000000..dc5f37ca3b --- /dev/null +++ b/deploy/fleetcommand/values.yaml @@ -0,0 +1,79 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +replicaCount: 1 + +image: + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 + pullPolicy: IfNotPresent + numGpus: 1 + serverCommand: tritonserver + serverArgs: + # Model Repository Configuration (REQUIRED) + # + # Configure sources for model repository below. Multiple repositories + # can be specified + # + # To download models from an S3 bucket, uncomment and configure below + # To specify a non-AWS S3 endpoint, use the form + # s3://https://your-s3-endpoint:443/bucket/model_repository + # + #- --model-repository=s3://triton-inference-server-repository/model_repository + # + # Model Control Mode (Optional, default: none) + # + # To set model control mode, uncomment and configure below + # TODO: Fix the following url, it is invalid + # See https://github.com/triton-inference-server/server/blob/r24.09/docs/model_management.md + # for more details + #- --model-control-mode=explicit|poll|none + # + # Additional server args + # + # see https://github.com/triton-inference-server/server/blob/r24.09/README.md + # for more details + +service: + # for Fleet Command, type should be NodePort + type: NodePort + # the following ports will be the external port opened for each service + httpNodePort: 30343 + grpcNodePort: 30344 + metricsNodePort: 30345 + +# AWS +#secret: + # update the following with base64 encoded parameters +# region: AWS_REGION +# id: AWS_SECRET_KEY_ID +# key: AWS_SECRET_ACCESS_KEY +# token: AWS_SESSION_TOKEN + +# Prometheus-Operator ServiceMonitor support +# change enabled to 'true' to enable a ServiceMonitor if your cluster has +# Prometheus-Operator installed +serviceMonitor: + enabled: false diff --git a/deploy/gcp/Chart.yaml b/deploy/gcp/Chart.yaml new file mode 100644 index 0000000000..2b7541bee6 --- /dev/null +++ b/deploy/gcp/Chart.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +appVersion: "1.0" +description: Triton Inference Server +name: triton-inference-server +version: 1.0.0 diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md new file mode 100644 index 0000000000..dc80cc77de --- /dev/null +++ b/deploy/gcp/README.md @@ -0,0 +1,300 @@ +<!-- +# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) + +# Kubernetes Deploy: Triton Inference Server Cluster + +A helm chart for installing a single cluster of Triton Inference +Server is provided. By default the cluster contains a single instance +of the inference server but the *replicaCount* configuration parameter +can be set to create a cluster of any size, as described below. + +This guide assumes you already have a functional Kubernetes cluster +and helm installed (see below for instructions on installing +helm). Note the following requirements: + +* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. Your cluster must contain sufficient CPU resources to support these services. At a minimum you will likely require 2 CPU nodes with machine type of n1-standard-2 or greater. 
+ +* If you want Triton Server to use GPUs for inferencing, your cluster +must be configured to contain the desired number of GPU nodes with +support for the NVIDIA driver and CUDA version required by the version +of the inference server you are using. + +This helm chart is available from [Triton Inference Server +GitHub](https://github.com/triton-inference-server/server) or from the +[NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com). + +The steps below describe how to set up a model repository, use helm to +launch the inference server, and then send inference requests to the +running server. You can access a Grafana endpoint to see real-time +metrics reported by the inference server. + + +## Installing Helm + +### Helm v3 + +If you do not already have Helm installed in your Kubernetes cluster, +executing the following steps from the [official helm install +guide](https://helm.sh/docs/intro/install/) will +give you a quick setup. + +If you're currently using Helm v2 and would like to migrate to Helm v3, +please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/). + +### Helm v2 + +> **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3. + +Below are example instructions for installing Helm v2. + +``` +$ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash +$ kubectl create serviceaccount -n kube-system tiller +serviceaccount/tiller created +$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller +$ helm init --service-account tiller --wait +``` + +If you run into any issues, you can refer to the official installation guide [here](https://v2.helm.sh/docs/install/). + +## Model Repository + +If you already have a model repository, you may use that with this helm +chart. If you do not have a model repository, you can check out a local +copy of the inference server source repository to create an example +model repository: + +``` +$ git clone https://github.com/triton-inference-server/server.git +``` + +Triton Server needs a repository of models that it will make available +for inferencing. For this example, you will place the model repository +in a Google Cloud Storage bucket. + +``` +$ gsutil mb gs://triton-inference-server-repository +``` + +Following the [QuickStart](../../docs/getting_started/quickstart.md), download the +example model repository to your system and copy it into the GCS +bucket. + +``` +$ gsutil cp -r docs/examples/model_repository gs://triton-inference-server-repository/model_repository +``` + +### GCS Permissions + +Make sure the bucket permissions are set so that the inference server +can access the model repository. If the bucket is public, then no +additional changes are needed and you can proceed to the "Deploy +Prometheus and Grafana" section. + +If bucket permissions need to be set with the +GOOGLE_APPLICATION_CREDENTIALS environment variable, then perform the +following steps: + +* Generate a Google service account JSON with proper permissions called + *gcp-creds.json*.
+ +* Create a Kubernetes secret from *gcp-creds.json*: + +``` + $ kubectl create configmap gcpcreds --from-literal "project-id=myproject" + $ kubectl create secret generic gcpcreds --from-file gcp-creds.json +``` + +* Modify templates/deployment.yaml to include the + GOOGLE_APPLICATION_CREDENTIALS environment variable: + +``` + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /secret/gcp-creds.json +``` + +* Modify templates/deployment.yaml to mount the secret in a volume at + /secret: + +``` + volumeMounts: + - name: vsecret + mountPath: "/secret" + readOnly: true + ... + volumes: + - name: vsecret + secret: + secretName: gcpcreds +``` + + +## Deploy Prometheus and Grafana + +The inference server metrics are collected by Prometheus and viewable +by Grafana. The inference server helm chart assumes that Prometheus +and Grafana are available so this step must be followed even if you +don't want to use Grafana. + +Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install these components. The +*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that +Prometheus can find the inference server metrics in the *example* +release deployed below. + +``` +$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack +``` + +Then port-forward to the Grafana service so you can access it from +your local browser. + +``` +$ kubectl port-forward service/example-metrics-grafana 8080:80 +``` + +Now you should be able to navigate in your browser to localhost:8080 +and see the Grafana login page. Use username=admin and +password=prom-operator to login. + +An example Grafana dashboard is available in dashboard.json. Use the +import function in Grafana to import and view this dashboard. + +## Deploy the Inference Server + +Deploy the inference server using the default configuration with the +following commands. + +``` +$ cd <directory containing Chart.yaml> +$ helm install example . +``` + +Use kubectl to see status and wait until the inference server pods are +running. + +``` +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s +``` + +There are several ways of overriding the default configuration as +described in this [helm +documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing). + +You can edit the values.yaml file directly or you can use the *--set* +option to override a single parameter with the CLI. For example, to +deploy a cluster of four inference servers use *--set* to set the +replicaCount parameter. + +``` +$ helm install example --set replicaCount=4 . +``` + +You can also write your own "config.yaml" file with the values you +want to override and pass it to helm. + +``` +$ cat << EOF > config.yaml +namespace: MyCustomNamespace +image: + imageName: nvcr.io/nvidia/tritonserver:custom-tag + modelRepositoryPath: gs://my_model_repository +EOF +$ helm install example -f config.yaml . +``` + +## Using Triton Inference Server + +Now that the inference server is running you can send HTTP or GRPC +requests to it to perform inferencing. By default, the inferencing +service is exposed with a LoadBalancer service type. Use the following +to find the external IP for the inference server. In this case it is +34.83.9.133. + +``` +$ kubectl get services +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +... 
+example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m +``` + +The inference server exposes an HTTP endpoint on port 8000, a GRPC +endpoint on port 8001, and a Prometheus metrics endpoint on +port 8002. You can use curl to get the metadata of the inference server +from the HTTP endpoint. + +``` +$ curl 34.83.9.133:8000/v2 +``` + +Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example +image classification client that can be used to perform inferencing +using image classification models being served by the inference +server. For example, + +``` +$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg +Request 0, batch size 1 +Image 'images/mug.jpg': + 504 (COFFEE MUG) = 0.723992 + 968 (CUP) = 0.270953 + 967 (ESPRESSO) = 0.00115997 +``` + +## Cleanup + +Once you've finished using the inference server, you should use helm to +delete the deployment. + +``` +$ helm list +NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE +example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default +example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default + +$ helm uninstall example +$ helm uninstall example-metrics +``` + +For the Prometheus and Grafana services, you should [explicitly delete +CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart): + +``` +$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com +``` + +You may also want to delete the GCS bucket you created to hold the +model repository.
+ +``` +$ gsutil rm -r gs://triton-inference-server-repository +``` diff --git a/deploy/gcp/dashboard.json b/deploy/gcp/dashboard.json new file mode 100644 index 0000000000..8960b41d35 --- /dev/null +++ b/deploy/gcp/dashboard.json @@ -0,0 +1,411 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.3.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nv_inference_request_success", + "legendFormat": "Success {{instance}}", + "refId": "A" + }, + { + "expr": "nv_inference_request_failure", + "legendFormat": "Failure {{instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cumulative Inference Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateReds", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 7, + "legend": { + "show": false + }, + "options": {}, + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Load Ratio (Total Time / Compute Time)", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "short", + "logBase": 1, 
+ "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Queue Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Compute Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Compute Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 19, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Triton Inference Server", + "uid": "slEY4dsZk", + "version": 8 +} diff --git a/deploy/gcp/templates/_helpers.tpl b/deploy/gcp/templates/_helpers.tpl new file mode 100644 index 0000000000..6dba910012 --- /dev/null +++ b/deploy/gcp/templates/_helpers.tpl @@ 
-0,0 +1,92 @@ +{{/* +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/}} + +{{/* vim: set filetype=mustache: */}} +{{/* +Create inference server name. +*/}} +{{- define "triton-inference-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "triton-inference-server.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* + Create inference server metrics service name and fullname derived from above and + truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{/* + Create inference server metrics monitor name and fullname derived from + above and truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics-monitor.name" -}} +{{- $basename := include "triton-inference-server.name" . 
-}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics-monitor.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "triton-inference-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/gcp/templates/deployment.yaml b/deploy/gcp/templates/deployment.yaml new file mode 100644 index 0000000000..b7592c7043 --- /dev/null +++ b/deploy/gcp/templates/deployment.yaml @@ -0,0 +1,81 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ template "triton-inference-server.name" . 
}} + release: {{ .Release.Name }} + + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.imageName }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + + resources: + limits: + nvidia.com/gpu: {{ .Values.image.numGpus }} + + args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}"] + + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + livenessProbe: + httpGet: + path: /v2/health/live + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + httpGet: + path: /v2/health/ready + port: http + + securityContext: + runAsUser: 1000 + fsGroup: 1000 diff --git a/deploy/gcp/templates/service.yaml b/deploy/gcp/templates/service.yaml new file mode 100644 index 0000000000..3315fd77db --- /dev/null +++ b/deploy/gcp/templates/service.yaml @@ -0,0 +1,91 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 8000 + targetPort: http + name: http-inference-server + - port: 8001 + targetPort: grpc + name: grpc-inference-server + - port: 8002 + targetPort: metrics + name: metrics-inference-server + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server-metrics.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics.name" . }} + chart: {{ template "triton-inference-server.chart" . 
}} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" +spec: + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics-monitor.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + selector: + matchLabels: + app: {{ template "triton-inference-server-metrics.name" . }} + endpoints: + - port: metrics + interval: 15s diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml new file mode 100644 index 0000000000..c5427c151e --- /dev/null +++ b/deploy/gcp/values.yaml @@ -0,0 +1,36 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +replicaCount: 1 + +image: + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 + pullPolicy: IfNotPresent + modelRepositoryPath: gs://triton-inference-server-repository/model_repository + numGpus: 1 + +service: + type: LoadBalancer diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md new file mode 100644 index 0000000000..595d4634ab --- /dev/null +++ b/deploy/gke-marketplace-app/README.md @@ -0,0 +1,201 @@ +<!-- +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +# NVIDIA Triton Inference Server GKE Marketplace Application + +**Table Of Contents** +- [NVIDIA Triton Inference Server GKE Marketplace Application](#nvidia-triton-inference-server-gke-marketplace-application) + - [Description](#description) + - [Prerequisites](#prerequisites) + - [Demo Instruction](#demo-instruction) + - [Additional Resources](#additional-resources) + - [Known Issues](#known-issues) + +## Description + +This repository contains the Google Kubernetes Engine (GKE) Marketplace Application for the NVIDIA Triton Inference Server deployer. + + - Triton GKE deployer is a helm chart deployer recommended by the GKE Marketplace + - Triton GKE deployer deploys a GKE ingress which accepts public inference requests + - Triton GKE deployer includes a horizontal pod autoscaler (HPA) which relies on the [Stackdriver custom metrics adapter](https://github.com/GoogleCloudPlatform/k8s-stackdriver/tree/master/custom-metrics-stackdriver-adapter) to monitor GPU duty cycle and auto-scale GPU nodes. + - This repo also contains a sample that generates a BERT model with TensorRT and uses Locust to experiment with GPU node autoscaling and to monitor client latency/throughput. + +![Cloud Architecture Diagram](diagram.png) + +## Prerequisites + + - [Install Google Cloud SDK on your laptop/client workstation](https://cloud.google.com/sdk/docs/install), so that the `gcloud` CLI can be run on the client and you can sign in with your GCP credentials. + - Alternatively, you can use [Google Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell). + +## Demo Instruction + +First, install this Triton GKE app into an existing GKE cluster with a GPU node pool; Google Cloud Marketplace currently doesn't support automatic creation of GPU clusters. Users have to run the following command to create a compatible cluster (GKE version >= 1.18.7) with GPU node pools. We recommend selecting the T4 or A100 (MIG) instance type and choosing the CPU ratio based on profiling of the actual inference workflow. + +Users need to follow these [instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/kubernetes-service-accounts#creating_a_kubernetes_service_account) to create a Kubernetes service account. In this example, we use `gke-test@k80-exploration.iam.gserviceaccount.com`. Make sure it has access to Artifact Registry and Monitoring Viewer.
For example, to grant access to custom metrics, which is required for HPA to work: +``` +gcloud iam service-accounts add-iam-policy-binding --role \ + roles/iam.workloadIdentityUser --member \ + "serviceAccount:<project-id>.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \ + <google-service-account>@<project-id>.iam.gserviceaccount.com + +kubectl annotate serviceaccount --namespace custom-metrics \ + custom-metrics-stackdriver-adapter \ + iam.gke.io/gcp-service-account=<google-service-account>@<project-id>.iam.gserviceaccount.com +``` + +Currently, GKE >= 1.18.7 is only supported in the GKE rapid channel; to find the latest version, please visit the [GKE release notes](https://cloud.google.com/kubernetes-engine/docs/release-notes). +``` +export PROJECT_ID=<your GCP project ID> +export ZONE=<GCP zone of your choice> +export REGION=<GCP region of your choice> +export DEPLOYMENT_NAME=<GKE cluster name, triton-gke for example> +# example: export SERVICE_ACCOUNT="gke-test@k80-exploration.iam.gserviceaccount.com" +export SERVICE_ACCOUNT=<Your GKE service account> + +gcloud beta container clusters create ${DEPLOYMENT_NAME} \ +--addons=HorizontalPodAutoscaling,HttpLoadBalancing \ +--service-account=${SERVICE_ACCOUNT} \ +--machine-type=n1-standard-8 \ +--node-locations=${ZONE} \ +--monitoring=SYSTEM \ +--zone=${ZONE} \ +--subnetwork=default \ +--scopes cloud-platform \ +--num-nodes 1 \ +--project ${PROJECT_ID} + +# add GPU node pools, user can modify number of nodes based on workloads +gcloud container node-pools create accel \ + --project ${PROJECT_ID} \ + --zone ${ZONE} \ + --cluster ${DEPLOYMENT_NAME} \ + --service-account=${SERVICE_ACCOUNT} \ + --num-nodes 2 \ + --accelerator type=nvidia-tesla-t4,count=1 \ + --enable-autoscaling --min-nodes 2 --max-nodes 3 \ + --machine-type n1-standard-4 \ + --disk-size=100 \ + --scopes cloud-platform \ + --verbosity error + +# so that you can run kubectl locally to the cluster
gcloud container clusters get-credentials ${DEPLOYMENT_NAME} --project ${PROJECT_ID} --zone ${ZONE} + +# deploy NVIDIA device plugin for GKE to prepare GPU nodes for driver install +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml + +# make sure you can run kubectl locally to access the cluster +kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user "$(gcloud config get-value account)" + +# enable stackdriver custom metrics adaptor +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml + +# create an ip for ingress traffic +gcloud compute addresses create ingress-triton --global +``` + +Creating a cluster and adding GPU nodes can take up to 10 minutes. Please be patient after executing this command. GPU resources in a GCP zone may be fully utilized, so please try a different zone in case compute resources cannot be allocated.
After the GKE cluster is running, run `kubectl get pods --all-namespaces` to make sure the client can access the cluster correctly. + +If you would like to experiment with A100 MIG-partitioned GPUs in GKE, please create the node pool with the following command: +``` +gcloud beta container node-pools create accel \ + --project ${PROJECT_ID} \ + --zone ${ZONE} \ + --cluster ${DEPLOYMENT_NAME} \ + --service-account=${SERVICE_ACCOUNT} \ + --num-nodes 1 \ + --accelerator type=nvidia-tesla-a100,count=1,gpu-partition-size=1g.5gb \ + --enable-autoscaling --min-nodes 1 --max-nodes 2 \ + --machine-type=a2-highgpu-1g \ + --disk-size=100 \ + --scopes cloud-platform \ + --verbosity error +``` + +Please note that A100 MIG in GKE does not support GPU metrics yet, and Triton GPU metrics are not compatible with A100 MIG. Hence, please disable GPU metrics by unselecting allowGPUMetrics when deploying the Triton GKE app. For the same reason, this deployer does not support inference workload auto-scaling on A100 MIG either. + +Second, go to this [GKE Marketplace link](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/triton-inference-server) to deploy the Triton application. + +Users can leave everything as default if their models have already been tested/validated with Triton. They can provide a GCS path pointing to the model repository containing their models. By default, we provide a BERT large model optimized by TensorRT in a public demo GCS bucket that is compatible with the `xx.yy` release of Triton Server in `gs://triton_sample_models/xx_yy`. However, please take note of the following about this demo bucket: +- The TensorRT engine provided in the demo bucket is only compatible with Tesla T4 GPUs. +- This bucket is located in `us-central1`, so loading from this bucket into Triton in other regions may be affected. +- The first deployment of this Triton GKE application will be slower than consecutive runs because the image needs to be pulled into the GKE cluster. +- You can find an example of how this model is generated and uploaded [here](trt-engine/README.md). + +Here, <xx.yy> is the version of the NGC Triton container needed. + +![GKE Marketplace Application UI](ui.png) + +We want to discuss the HPA autoscaling metrics users can leverage. GPU power (percentage of maximum power) tends to be a reliable metric, especially for larger GPUs like V100 and A100. GKE currently natively supports GPU duty cycle, which is the GPU utilization reported by `nvidia-smi`. We ask that users always profile their model to determine the autoscaling target and metrics. When selecting the right metrics for autoscaling, the goal should be to pick metrics that: 1) meet the SLA requirement, 2) give consideration to transient request load, and 3) keep the GPU as fully utilized as possible. Profiling helps in two ways: if users decide to use duty cycle or another GPU metric, it is recommended to establish a baseline that links the SLA requirement, such as latency, to the GPU metric; for example, for model A, latency will be below 10ms 99% of the time when duty cycle is below 80% utilization. Additionally, profiling also provides insight into model optimization for inference, with tools like [Nsight](https://developer.nvidia.com/nsight-systems). + +Once the application is deployed successfully, get the public IP from the ingress: +``` +> kubectl get ingress +NAME CLASS HOSTS ADDRESS PORTS AGE +triton-external <none> * 35.186.215.182 80 107s +``` + +Third, we will try sending requests to the server with the provided client example.
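+
+Before starting a load test, it can help to confirm that the ingress is actually routing to a healthy Triton instance. Below is a minimal sanity-check sketch; it is not part of the repository's client samples, it assumes Python 3 with the `requests` package installed and the `INGRESS_HOST`/`INGRESS_PORT` values from the ingress above, and it only touches Triton's standard HTTP/REST endpoints (`/v2/health/ready` and `/v2`).
+
+```python
+# sanity_check.py - hypothetical helper, not shipped with the GKE marketplace app.
+# Assumes: pip install requests; INGRESS_HOST/INGRESS_PORT exported in the environment.
+import os
+import sys
+
+import requests
+
+base_url = "http://{}:{}".format(
+    os.environ.get("INGRESS_HOST", "localhost"),
+    os.environ.get("INGRESS_PORT", "80"),
+)
+
+# /v2/health/ready returns HTTP 200 once the server and its models are ready.
+ready = requests.get(base_url + "/v2/health/ready", timeout=5)
+if ready.status_code != 200:
+    sys.exit("Triton is not ready yet (HTTP {})".format(ready.status_code))
+
+# /v2 returns server metadata (name, version, extensions) as JSON.
+meta = requests.get(base_url + "/v2", timeout=5).json()
+print("Connected to {} {}".format(meta.get("name"), meta.get("version")))
+```
+
+If the check passes, proceed with the Locust or Perf Analyzer clients described below.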
+ +If you selected the option to deploy Triton to accept HTTP requests, please launch [Locust](https://docs.locust.io/en/stable/installation.html) with the ingress host and port to query the Triton Inference Server. In this [example script](https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app/client-sample/locustfile_bert.py), we send requests to a Triton server that has loaded, from a GCP bucket, a BERT large TensorRT engine with a sequence length of 128. We simulate 1000 concurrent users as the target and spawn users at a rate of 50 users per second. +``` +locust -f locustfile_bert.py -H http://${INGRESS_HOST}:${INGRESS_PORT} +``` + +The client example pushes about ~650 QPS (queries per second) to the Triton server and will trigger an auto-scale of T4 GPU nodes (we recommend using T4 and A100 [MIG] for inference). From the Locust UI, we will observe a drop in the latency mean and variance for the requests. At the end, after autoscaling, we see the latency stabilize at ~200 ms, end to end from a US client to a Europe server, which is excellent for a model that has 345 million parameters. Since each node uses one T4 plus an n1-standard-4 instance and can handle ~450 QPS, the on-demand price of ($0.35+$0.19)=$0.54/hr translates to 3 million inferences per dollar for the BERT large model at batch size 1. Furthermore, with the 3-year commitment price, the hourly rate of ($0.16+$0.08)=$0.24/hr translates to 6.75 million inferences per dollar. + +![Locust Client Chart](client.png) + +Alternatively, users can opt to use +[Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) +to profile and study the performance of the Triton Inference Server. Here we also +provide a +[client script](https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh) +that uses Perf Analyzer to send gRPC requests to the Triton Server GKE deployment. The Perf Analyzer +client requires the NGC Triton Client Container. + +``` +bash perf_analyzer_grpc.sh ${INGRESS_HOST}:${INGRESS_PORT} +``` + +## Additional Resources + +See the following resources to learn more about NVIDIA Triton Inference Server and GKE GPU capabilities. + +**Documentation** + +- [GPU in Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus) +- [Optimize GPU Performance in Google Cloud Platform](https://cloud.google.com/compute/docs/gpus/optimize-gpus) +- [Triton Inference Server](https://github.com/triton-inference-server/server) +- [AI Platform Prediction: Custom container concepts with Triton Server](https://cloud.google.com/solutions/ai-platform-prediction-custom-container-concepts) by [Kevin Tsai](https://github.com/merlin1649) +- [AI Platform Prediction: Direct model server setup for NVIDIA Triton Inference Server](https://cloud.google.com/solutions/ai-platform-prediction-direct-model-server-nvidia) by [Kevin Tsai](https://github.com/merlin1649) + +## Known Issues + +- GKE one-click cluster creation doesn't support GPU node pools at the moment; users have to manually create a compatible (>=1.18.7) cluster and attach a node pool (T4 and A100 MIG recommended) +- When the Horizontal Pod Autoscaler (HPA) expands and all GPU node pools are already utilized, GKE will request a new GPU node, which can take between 4-7 minutes; it can be a long wait, on top of GPU driver install and image pulling.
We recommend that users leverage multi-tier model serving and Triton's priority feature to create a cushion for latency-critical models, and allocate an active standby GPU node for spikes in requests. diff --git a/deploy/gke-marketplace-app/benchmark/README.md b/deploy/gke-marketplace-app/benchmark/README.md new file mode 100644 index 0000000000..5138148035 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/README.md @@ -0,0 +1,95 @@ +<!-- +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +# Benchmarking with NVIDIA Triton Inference Server GKE Marketplace Application + +**Table Of Contents** +- [Models](#models) +- [Performance](#performance) + +## Models + +First, we collect a set of TensorFlow and TensorRT models to compare: + +- Get [Distill Bert fine-tuned with Squad Q&A task](https://huggingface.co/distilbert-base-cased-distilled-squad/tree/main) from Huggingface. `wget https://huggingface.co/distilbert-base-cased-distilled-squad/blob/main/saved_model.tar.gz` +- Get [Bert base fine-tuned with Squad Q&A task](https://huggingface.co/deepset/bert-base-cased-squad2/tree/main) from Huggingface. `wget https://huggingface.co/deepset/bert-base-cased-squad2/blob/main/saved_model.tar.gz` +- Follow [TensorRT Demo Bert](https://github.com/NVIDIA/TensorRT/tree/master/demo/BERT) to convert the BERT base model to a TensorRT engine; choose a sequence length of 384 to match the previous 2 TensorFlow models. In the last step, we choose to create the TensorRT engine with 2 optimization profiles, profile 0 for batch size 1 and profile 1 for batch size 4, and run: `python3 builder.py -m models/fine-tuned/bert_tf_ckpt_base_qa_squad2_amp_384_v19.03.1/model.ckpt -o engines/model.plan -b 8 -s 384 --fp16 --int8 --strict -c models/fine-tuned/bert_tf_ckpt_base_qa_squad2_amp_384_v19.03.1 --squad-json ./squad/train-v2.0.json -v models/fine-tuned/bert_tf_ckpt_base_qa_squad2_amp_384_v19.03.1/vocab.txt --calib-num 100 -iln -imh`. This needs to be run on the target inference GPU (an engine optimized with A100 cannot be used for inference on T4).
+ +We then place the models into a GCS bucket with the following structure; a `config.pbtxt` is provided for each model. +``` + ├── bert_base_trt_gpu + │ ├── 1 + │ │ └── model.plan + │ └── config.pbtxt + ├── bert_base_trt_gpu_seqlen128 + │ ├── 1 + │ │ └── model.plan + │ └── config.pbtxt + ├── bert_base_tf_gpu + │ ├── 1 + │ │ └── model.savedmodel + │ └── config.pbtxt + ├── bert_base_tf_cpu + │ ├── 1 + │ │ └── model.savedmodel + │ └── config.pbtxt + ├── bert_distill_tf_gpu + │ ├── 1 + │ │ └── model.savedmodel + │ └── config.pbtxt + └── bert_distill_tf_cpu + ├── 1 + │ └── model.savedmodel + └── config.pbtxt +``` + +When deploying the Triton GKE application, point the model repository to a directory that contains the structure above with the actual models. + +## Performance + +We use Triton's perf analyzer to benchmark the performance of each model; the perf analyzer resides in another pod of the GKE cluster. +```bash +export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}') +bash perf_query.sh 35.194.5.119:80 bert_base_trt_gpu 384 +``` + +We deploy the models on n1-standard-96 for the CPU BERT BASE and Distill BERT models and on (n1-standard-4 + T4) for the GPU BERT models; the sequence length of the BERT models is 384 tokens, and we measure latency/throughput with a concurrency sweep using Triton's performance analyzer. The latency includes Istio ingress/load balancing and reflects the true round-trip cost in the same GCP zone. + +For all models with a sequence length of 384: +CPU BERT BASE: latency: 700ms, throughput: 12 qps +CPU Distill BERT: latency: 369ms, throughput: 24 qps + +GPU BERT BASE: latency: 230ms, throughput: 34.7 qps +GPU Distill BERT: latency: 118ms, throughput: 73.3 qps +GPU TensorRT BERT BASE: latency: 50ms, throughput: 465 qps + +The n1-standard-96 is priced at $4.56/hr, while the n1-standard-4 at $0.19/hr plus a T4 at $0.35/hr totals $0.54/hr. While achieving a much lower latency, BERT BASE inference with TensorRT on T4 is therefore over 163 times more cost-efficient than Distill BERT inference on n1-standard-96. + + + diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_cpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_cpu/config.pbtxt new file mode 100644 index 0000000000..3bfccb5c45 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_cpu/config.pbtxt @@ -0,0 +1,35 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +max_batch_size: 4 +dynamic_batching { + preferred_batch_size: 1 + max_queue_delay_microseconds: 2000000 +} +instance_group { + count: 2 + kind: KIND_CPU +} diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt new file mode 100644 index 0000000000..b6ca32f9a2 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt @@ -0,0 +1,35 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +max_batch_size: 4 +dynamic_batching { + preferred_batch_size: 4 + max_queue_delay_microseconds: 200000 +} +instance_group { + count: 2 + kind: KIND_GPU +} diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt new file mode 100644 index 0000000000..acbd124bf2 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +platform: "tensorrt_plan" +max_batch_size: 4 +dynamic_batching { + preferred_batch_size: 4 + max_queue_delay_microseconds: 200000 +} +instance_group { + count: 2 + profile: "1" + kind: KIND_GPU +} + diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu_seqlen128/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu_seqlen128/config.pbtxt new file mode 100644 index 0000000000..2ee39e7dbc --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu_seqlen128/config.pbtxt @@ -0,0 +1,37 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
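+# Sequence-length-128 TensorRT engine variant: the dynamic batcher groups
+# requests into batches of up to 8, waiting at most 200 ms (200000 us) to form
+# a preferred batch, and two engine instances are run per GPU.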
+ +platform: "tensorrt_plan" +max_batch_size: 8 +dynamic_batching { + preferred_batch_size: 8 + max_queue_delay_microseconds: 200000 +} +instance_group { + count: 2 + kind: KIND_GPU +} + diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt new file mode 100644 index 0000000000..c8e8074309 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt @@ -0,0 +1,35 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +max_batch_size: 4 +dynamic_batching { + preferred_batch_size: 1 + max_queue_delay_microseconds: 2000000 +} +instance_group { + count: 2 + kind: KIND_CPU +} diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt new file mode 100644 index 0000000000..b6ca32f9a2 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt @@ -0,0 +1,35 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +max_batch_size: 4 +dynamic_batching { + preferred_batch_size: 4 + max_queue_delay_microseconds: 200000 +} +instance_group { + count: 2 + kind: KIND_GPU +} diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh new file mode 100755 index 0000000000..0ce6e120b7 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +SERVER_HOST=${1:-"${INGRESS_HOST}:${INGRESS_PORT}"} # need update public IP +MODEL_NAME=${2:-"${MODEL_NAME}"} +SEQ_LENGTH=${3:-"${SEQ_LEN}"} +BATCH_SIZE=${4:-2} +MAX_LATENCY=${5:-5000} +MAX_CLIENT_THREADS=${6:-20} +MAX_CONCURRENCY=${7:-24} +MODEL_VERSION=${8:-1} +precision=${9:-"fp32"} +PERFCLIENT_PERCENTILE=${10:-90} +MAX_TRIALS=${12:-40} + +ARGS="\ + --max-threads ${MAX_CLIENT_THREADS} \ + -m ${MODEL_NAME} \ + -x ${MODEL_VERSION} \ + -p 3000 \ + --async \ + --concurrency-range 4:${MAX_CONCURRENCY}:2 \ + -r ${MAX_TRIALS} \ + -v \ + -i HTTP \ + -u ${SERVER_HOST} \ + -b ${BATCH_SIZE} \ + -l ${MAX_LATENCY} \ + -z \ + --percentile=${PERFCLIENT_PERCENTILE}" + +echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')" + +/workspace/install/bin/perf_client $ARGS -f perf.csv \ No newline at end of file diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml new file mode 100644 index 0000000000..a63a12ce34 --- /dev/null +++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml @@ -0,0 +1,42 @@ +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
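+# Benchmark client pod: it runs the Triton SDK image (which ships
+# perf_analyzer / perf_client) and simply sleeps, so you can `kubectl exec`
+# into it and launch the benchmark scripts against the Triton service.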
+ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: nv-triton-client + name: nv-triton-client + namespace: default +spec: + containers: + - image: nvcr.io/nvidia/tritonserver:24.09-py3-sdk + imagePullPolicy: Always + name: nv-triton-client + securityContext: + privileged: true + command: [ "/bin/bash", "-c", "--" ] + args: [ "while true; do sleep 30; done;" ] diff --git a/deploy/gke-marketplace-app/client-sample/bert_request.json b/deploy/gke-marketplace-app/client-sample/bert_request.json new file mode 100644 index 0000000000..ce4b956db6 --- /dev/null +++ b/deploy/gke-marketplace-app/client-sample/bert_request.json @@ -0,0 +1,27 @@ +{ + "inputs": [{ + "name": "input_ids", + "shape": [1, 128], + "datatype": "INT32", + "parameters": {}, + "data": [101, 2054, 2003, 23435, 5339, 1029, 102, 23435, 5339, 2003, 1037, 2152, 2836, 2784, 4083, 28937, 4132, 2008, 18058, 2659, 2397, 9407, 1998, 2152, 2083, 18780, 2005, 18726, 2107, 2004, 16755, 2545, 1010, 4613, 1998, 3746, 1013, 2678, 2006, 1050, 17258, 2401, 14246, 2271, 1012, 2009, 2950, 11968, 8043, 2015, 2000, 12324, 4275, 1010, 1998, 13354, 7076, 2000, 2490, 3117, 23092, 1998, 9014, 2077, 11243, 20600, 2015, 2005, 28937, 1012, 2651, 1050, 17258, 2401, 2003, 2330, 1011, 14768, 6129, 11968, 8043, 2015, 1998, 13354, 7076, 1999, 23435, 5339, 2061, 2008, 1996, 2784, 4083, 2451, 2064, 7661, 4697, 1998, 7949, 2122, 6177, 2000, 2202, 5056, 1997, 3928, 23435, 5339, 20600, 2015, 2005, 2115, 18726, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + }, { + "name": "input_mask", + "shape": [1, 128], + "datatype": "INT32", + "parameters": {}, + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + }, { + "name": "segment_ids", + "shape": [1, 128], + "datatype": "INT32", + "parameters": {}, + "data": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + }], + "outputs": [{ + "name": "cls_squad_logits", + "parameters": { + "binary_data": false + } + }] +} diff --git a/deploy/gke-marketplace-app/client-sample/locustfile_bert.py b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py new file mode 100755 index 0000000000..aae8c69f43 --- /dev/null +++ b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +from locust import HttpUser, LoadTestShape, between, task + + +class ProfileLoad(LoadTestShape): + """ + This load profile starts at 0 and steps up by step_users + increments every tick, up to target_users. After reaching + target_user level, load will stay at target_user level + until time_limit is reached. + """ + + target_users = 1000 + step_users = 50 # ramp users each step + time_limit = 3600 # seconds + + def tick(self): + num_steps = self.target_users / self.step_users + run_time = round(self.get_run_time()) + + if run_time < self.time_limit: + # Ramp by step_users per tick until target_users is reached, then hold. + if run_time < num_steps: + user_count = run_time * self.step_users + else: + user_count = self.target_users + return (user_count, self.step_users) + else: + return None + + +class TritonUser(HttpUser): + wait_time = between(0.2, 0.2) + + @task() + def bert(self): + response = self.client.post(self.url1, data=json.dumps(self.data)) + + def on_start(self): + with open("bert_request.json") as f: + self.data = json.load(f) + + self.url1 = "{}/v2/models/{}/infer".format(self.environment.host, "bert") diff --git a/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh b/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh new file mode 100755 index 0000000000..ae5476f338 --- /dev/null +++ b/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +SERVER_HOST=${1:-"${INGRESS_HOST}:${INGRESS_PORT}"} # need update public IP +MODEL_VERSION=${2:-1} +precision=${3:-"int8"} +BATCH_SIZE=${4:-1} +MAX_LATENCY=${5:-500} +MAX_CLIENT_THREADS=${6:-6} +MAX_CONCURRENCY=${7:-20} +MODEL_NAME=${8:-"bert"} +SEQ_LENGTH=${9:-"128"} +PERFCLIENT_PERCENTILE=${10:-90} +STABILITY_PERCENTAGE=${11:-0.01} +MAX_TRIALS=${12:-1000000} + +ARGS="\ + --max-threads ${MAX_CLIENT_THREADS} \ + -m ${MODEL_NAME} \ + -x ${MODEL_VERSION} \ + -p 1000 \ + -t ${MAX_CONCURRENCY} \ + -s ${STABILITY_PERCENTAGE} \ + -r ${MAX_TRIALS} \ + -v \ + -i gRPC \ + -u ${SERVER_HOST} \ + -b ${BATCH_SIZE} \ + -l ${MAX_LATENCY} \ + -z \ + --shape=input_ids:${SEQ_LENGTH} \ + --shape=segment_ids:${SEQ_LENGTH} \ + --shape=input_mask:${SEQ_LENGTH} \ + --percentile=${PERFCLIENT_PERCENTILE}" + +echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')" + +/workspace/install/bin/perf_client $ARGS diff --git a/deploy/gke-marketplace-app/client.png b/deploy/gke-marketplace-app/client.png new file mode 100644 index 0000000000..1fe3dbe7d5 Binary files /dev/null and b/deploy/gke-marketplace-app/client.png differ diff --git a/deploy/gke-marketplace-app/diagram.png b/deploy/gke-marketplace-app/diagram.png new file mode 100644 index 0000000000..7592672e94 Binary files /dev/null and b/deploy/gke-marketplace-app/diagram.png differ diff --git a/deploy/gke-marketplace-app/server-deployer/Dockerfile b/deploy/gke-marketplace-app/server-deployer/Dockerfile new file mode 100644 index 0000000000..5bb34adc65 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/Dockerfile @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FROM gcr.io/cloud-marketplace-tools/k8s/deployer_helm/onbuild + diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh new file mode 100755 index 0000000000..19d84816a0 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/') +export APP_NAME=tritonserver +export MAJOR_VERSION=2.50 +export MINOR_VERSION=2.50.0 +export NGC_VERSION=24.09-py3 + +docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION + +docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$MAJOR_VERSION +docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$MINOR_VERSION +docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$NGC_VERSION + +docker push $REGISTRY/$APP_NAME:$MINOR_VERSION +docker push $REGISTRY/$APP_NAME:$MAJOR_VERSION +docker push $REGISTRY/$APP_NAME:$NGC_VERSION + +docker build --tag $REGISTRY/$APP_NAME/deployer . 
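+# The deployer image wraps the Helm chart via GCP's deployer_helm onbuild base
+# (see the Dockerfile above); below it is tagged and pushed with the same
+# major/minor versions as the server image.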
+ +docker tag $REGISTRY/$APP_NAME/deployer $REGISTRY/$APP_NAME/deployer:$MAJOR_VERSION +docker tag $REGISTRY/$APP_NAME/deployer $REGISTRY/$APP_NAME/deployer:$MINOR_VERSION +docker push $REGISTRY/$APP_NAME/deployer:$MAJOR_VERSION +docker push $REGISTRY/$APP_NAME/deployer:$MINOR_VERSION diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml new file mode 100644 index 0000000000..e9f8880a0b --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +appVersion: "2.50" +description: Triton Inference Server +name: triton-inference-server +version: 2.50.0 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/logo.png b/deploy/gke-marketplace-app/server-deployer/chart/triton/logo.png new file mode 100644 index 0000000000..9c70ab77fb Binary files /dev/null and b/deploy/gke-marketplace-app/server-deployer/chart/triton/logo.png differ diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/_helpers.tpl b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/_helpers.tpl new file mode 100644 index 0000000000..cd4ef9264a --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/_helpers.tpl @@ -0,0 +1,60 @@ +{{/* +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/}} + +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "triton-inference-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "triton-inference-server.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "triton-inference-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml new file mode 100644 index 0000000000..28bfbf08c4 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml @@ -0,0 +1,68 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +{{ if and .Values.gcpMarketplace (eq .Values.gcpMarketplace true) }} +--- +apiVersion: app.k8s.io/v1beta1 +kind: Application +metadata: + name: "{{ .Release.Name }}" + annotations: + kubernetes-engine.cloud.google.com/icon: >- + data:image/png;base64,{{ .Files.Get "logo.png" | b64enc }} + marketplace.cloud.google.com/deploy-info: '{"partner_id": "nvidia", "product_id": "triton", "partner_name": "NVIDIA"}' + labels: + app.kubernetes.io/name: "{{ .Release.Name }}" +spec: + descriptor: + type: Triton + version: "{{ .Values.publishedVersion }}" + description: |- + Triton Inference Server provides a cloud and edge inferencing solution + optimized for both CPUs and GPUs. Triton supports an HTTP/REST and GRPC + protocol that allows remote clients to request inferencing for any model + being managed by the server. + + notes: |- + + Send request to Triton server by using IP address "ingress-triton", + send to IP:80/v2/models/{}/infer + + Links: + - [NVIDIA Triton page](https://developer.nvidia.com/nvidia-triton-inference-server) + - [Documentation](https://github.com/triton-inference-server/server) + + selector: + matchLabels: + app.kubernetes.io/name: "{{ .Release.Name }}" + componentKinds: + - group: apps/v1 + kind: Deployment + - group: v1 + kind: Service + - group: autoscaling/v2 + kind: HorizontalPodAutoscaler +{{ end }} diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml new file mode 100644 index 0000000000..75ac1aee81 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml @@ -0,0 +1,93 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "triton-inference-server.name" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + replicas: {{ .Values.initReplicaCount }} + selector: + matchLabels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.registry }}/{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + + resources: + limits: + nvidia.com/gpu: {{ .Values.image.numGpus }} + env: + - name: LD_PRELOAD + value: {{ .Values.image.ldPreloadPath }} + args: ["tritonserver", "--model-store={{ .Values.modelRepositoryPath }}", + "--strict-model-config={{ .Values.image.strictModelConfig }}", + "--log-verbose={{ .Values.image.logVerboseLevel }}", + "--allow-gpu-metrics={{ .Values.image.allowGPUMetrics }}"] + + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + livenessProbe: + httpGet: + path: /v2/health/live + port: http + initialDelaySeconds: {{ .Values.deployment.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.deployment.livenessProbe.periodSeconds }} + timeoutSeconds: {{ .Values.deployment.livenessProbe.timeoutSeconds }} + successThreshold: {{ .Values.deployment.livenessProbe.successThreshold }} + failureThreshold: {{ .Values.deployment.livenessProbe.failureThreshold }} + readinessProbe: + httpGet: + path: /v2/health/ready + port: http + initialDelaySeconds: {{ .Values.deployment.readinessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.deployment.readinessProbe.periodSeconds }} + timeoutSeconds: {{ .Values.deployment.readinessProbe.timeoutSeconds }} + successThreshold: {{ .Values.deployment.readinessProbe.successThreshold }} + failureThreshold: {{ .Values.deployment.readinessProbe.failureThreshold }} + + securityContext: + runAsUser: 1000 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/hpa.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/hpa.yaml new file mode 100644 index 0000000000..89275ea7de --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/hpa.yaml @@ -0,0 +1,49 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: triton-hpa + namespace: {{ .Release.Namespace }} + labels: + app: triton-hpa +spec: + minReplicas: {{ .Values.minReplicaCount }} + maxReplicas: {{ .Values.maxReplicaCount }} + metrics: + - type: External + external: + metric: + name: kubernetes.io|container|accelerator|duty_cycle + target: + type: AverageValue + averageValue: {{ .Values.HPATargetAverageValue }} + + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ template "triton-inference-server.name" . }} diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/ingress.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/ingress.yaml new file mode 100644 index 0000000000..2b6da5fe18 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/ingress.yaml @@ -0,0 +1,48 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: triton-external + annotations: + kubernetes.io/ingress.class: "gce" + kubernetes.io/ingress.global-static-ip-name: "ingress-triton" +spec: + rules: + - http: + paths: + - path: "/" + pathType: Prefix + backend: + service: + name: triton-inference-server + port: + {{ if eq .Values.tritonProtocol "gRPC" }} + number: 8001 + {{ else }} + number: 8000 + {{ end }} diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml new file mode 100644 index 0000000000..93ef6f9da3 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml @@ -0,0 +1,55 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server.name" . }} + namespace: {{ .Release.Namespace }} + annotations: + cloud.google.com/neg: '{"ingress": true}' + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . 
}} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 8000 + targetPort: http + name: http-inference-server + - port: 8001 + targetPort: grpc + name: grpc-inference-server + - port: 8002 + targetPort: metrics + name: metrics-inference-server + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + + diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml new file mode 100644 index 0000000000..450d8f735c --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -0,0 +1,66 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
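+# Default chart values: initial/min/max replica counts and the HPA duty-cycle
+# target used for autoscaling, the request protocol, the GCS model repository
+# path, the Triton image settings, and the liveness/readiness probe tuning.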
+ +initReplicaCount: 1 +minReplicaCount: 1 +maxReplicaCount: 3 +# choice from gRPC and HTTP +tritonProtocol: HTTP +# HPA GPU utilization autoscaling target +HPATargetAverageValue: 85 +modelRepositoryPath: gs://triton_sample_models/24.09 +publishedVersion: '2.50.0' +gcpMarketplace: true + +image: + registry: gcr.io + repository: nvidia-ngc-public/tritonserver + tag: 24.09-py3 + pullPolicy: IfNotPresent + # modify the model repository here to match your GCP storage bucket + numGpus: 1 + strictModelConfig: False + # add in custom library which could include custom ops in the model + ldPreloadPath: '' + logVerboseLevel: 0 + allowGPUMetrics: True + +service: + type: NodePort + +deployment: + livenessProbe: + failureThreshold: 60 + initialDelaySeconds: 10 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 60 + initialDelaySeconds: 10 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml new file mode 100644 index 0000000000..16494b5261 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -0,0 +1,123 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +x-google-marketplace: + schemaVersion: v2 + applicationApiVersion: v1beta1 + publishedVersion: '2.50.0' + publishedVersionMetadata: + releaseNote: >- + Initial release. 
+ releaseTypes: + - Feature + recommended: true + + clusterConstraints: + k8sVersion: ">=1.18.7" + assistedClusterCreation: + type: DISABLED + creationGuidance: GKE currently doesn't support auto-create GPU clusters, please refer to <a href="https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app">Triton GKE Marketplace Deployer</a> to manually create the GKE cluster >= 1.18.7 and add GPU node pools + resources: + - requests: + gpu: + nvidia.com/gpu: {} + istio: + type: REQUIRED + + images: + '': + properties: + triton.image.registry: + type: REGISTRY + triton.image.repository: + type: REPO_WITHOUT_REGISTRY + triton.image.tag: + type: TAG + +properties: + name: + type: string + x-google-marketplace: + type: NAME + namespace: + type: string + x-google-marketplace: + type: NAMESPACE + initReplicaCount: + title: Initial number of Triton pod instances to deploy. + type: integer + default: 1 + minReplicaCount: + title: Minimum number of Triton pod instances in the deployment for autoscaling. + type: integer + default: 1 + maxReplicaCount: + title: Maximum number of Triton pod instances in the deployment for autoscaling. + type: integer + default: 3 + tritonProtocol: + title: Request protocol to send data to Triton, choose from gRPC and HTTP. + type: string + default: HTTP + HPATargetAverageValue: + title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. + type: integer + default: 85 + modelRepositoryPath: + type: string + title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. + default: gs://triton_sample_models/models + image.ldPreloadPath: + type: string + title: Leave this empty by default. Triton allows users to create custom layers for backend such as TensorRT plugin or Tensorflow custom ops, the compiled shared library must be provided via LD_PRELOAD environment variable. + default: '' + image.logVerboseLevel: + type: integer + title: Set verbose logging level. Zero (0) disables verbose logging and values >= 1 enable verbose logging, this is helpful when user unsure if the model is compatible with Triton or for general debug. + default: 0 + image.strictModelConfig: + type: boolean + title: Leave this unchecked by default. When strictModelConfig is not checked(False), Triton will try to infer the config file from model file, when checked(True), user need to provide config.pbtxt in model repository. + default: False + image.allowGPUMetrics: + type: boolean + title: Select by default. When use A100 MIG, unselect to disable GPU Memory metrics reported by Triton, as current GPU metrics not support on A100 MIG. 
+ default: True + istioEnabled: + type: boolean + x-google-marketplace: + type: ISTIO_ENABLED + default: True + + +required: +- name +- namespace +- modelRepositoryPath + +form: +- widget: help + description: GKE currently doesn't support autocreate GPU clusters, please refer to <a href="https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app">Triton GKE Marketplace Deployer</a> to manually create the GKE cluster >= 1.18.7 and add GPU node pools. Also, please refer to the <a href="https://github.com/triton-inference-server/server">Triton GITHUB page</a> for product information. diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml new file mode 100644 index 0000000000..f3525a52f1 --- /dev/null +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -0,0 +1,123 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +x-google-marketplace: + schemaVersion: v2 + applicationApiVersion: v1beta1 + publishedVersion: '2.50.0' + publishedVersionMetadata: + releaseNote: >- + Initial release. + releaseTypes: + - Feature + recommended: true + + clusterConstraints: + k8sVersion: ">=1.18.7" + assistedClusterCreation: + type: DISABLED + creationGuidance: GKE currently doesn't support auto-create GPU clusters, please refer to <a href="https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app">Triton GKE Marketplace Deployer</a> to manually create the GKE cluster >= 1.18.7 and add GPU node pools + resources: + - requests: + gpu: + nvidia.com/gpu: {} + istio: + type: REQUIRED + + images: + '': + properties: + triton.image.registry: + type: REGISTRY + triton.image.repository: + type: REPO_WITHOUT_REGISTRY + triton.image.tag: + type: TAG + +properties: + name: + type: string + x-google-marketplace: + type: NAME + namespace: + type: string + x-google-marketplace: + type: NAMESPACE + initReplicaCount: + title: Initial number of Triton pod instances to deploy. 
+ type: integer + default: 1 + minReplicaCount: + title: Minimum number of Triton pod instances in the deployment for autoscaling. + type: integer + default: 1 + maxReplicaCount: + title: Maximum number of Triton pod instances in the deployment for autoscaling. + type: integer + default: 3 + tritonProtocol: + title: Request protocol to send data to Triton; choose from gRPC and HTTP. + type: string + default: HTTP + HPATargetAverageValue: + title: HPA autoscaling target. GKE currently supports Duty Cycle (GPU utilization); when the target is reached, the Triton Server service creates another pod instance. We recommend analyzing your model's inference behavior to choose an appropriate GPU metric target for your latency requirement, and leaving some headroom to mitigate transient load spikes. For users interested in customizing autoscaling metrics, we recommend GPU power (percentage of power), queue time, or SLA measurements such as latency. + type: integer + default: 85 + modelRepositoryPath: + type: string + title: Bucket where models are stored. Please make sure the user/service account used to create the GKE app has permission to access this GCS bucket. Read the Triton documentation for configuration and formatting details; TensorRT, TensorFlow, PyTorch, ONNX, and other model formats are supported. + default: gs://triton_sample_models/24.09 + image.ldPreloadPath: + type: string + title: Leave this empty by default. Triton allows users to create custom layers for backends, such as TensorRT plugins or TensorFlow custom ops; the compiled shared library must be provided via the LD_PRELOAD environment variable. + default: '' + image.logVerboseLevel: + type: integer + title: Set the verbose logging level. Zero (0) disables verbose logging and values >= 1 enable it; this is helpful when you are unsure whether a model is compatible with Triton or for general debugging. + default: 0 + image.strictModelConfig: + type: boolean + title: Leave this unchecked by default. When strictModelConfig is unchecked (False), Triton tries to infer the configuration from the model file; when checked (True), you need to provide a config.pbtxt in the model repository. + default: False + image.allowGPUMetrics: + type: boolean + title: Selected by default. When using A100 MIG, unselect to disable the GPU memory metrics reported by Triton, as the current GPU metrics are not supported on A100 MIG. + default: True + istioEnabled: + type: boolean + x-google-marketplace: + type: ISTIO_ENABLED + default: True + + +required: +- name +- namespace +- modelRepositoryPath + +form: +- widget: help + description: GKE currently doesn't support autocreate GPU clusters, please refer to <a href="https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app">Triton GKE Marketplace Deployer</a> to manually create the GKE cluster >= 1.18.7 and add GPU node pools. Also, please refer to the <a href="https://github.com/triton-inference-server/server">Triton GITHUB page</a> for product information. diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md new file mode 100644 index 0000000000..0c8012eb68 --- /dev/null +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -0,0 +1,63 @@ +<!-- +# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +# Instructions to create the BERT engine for each Triton update + +## Description + +The commands below run the TensorRT NGC container, build an INT8 BERT Large TensorRT engine (batch size 1, sequence length 128) with the TensorRT demo/BERT scripts, and copy the resulting engine into the GCS model repository used by this app. + +``` +docker run --gpus all -it --network host \ + --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ + -v ~:/scripts nvcr.io/nvidia/tensorrt:24.09-py3 + +pip install onnx six torch tf2onnx tensorflow + +git clone -b main https://github.com/NVIDIA/TensorRT.git +cd TensorRT +git submodule update --init --recursive + +export TRT_OSSPATH=/workspace/TensorRT +export TRT_LIBPATH=/lib/x86_64-linux-gnu + +pushd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && ln -s ngc-cli/ngc ngc && echo "no-apikey\nascii\n" | ngc config set + +popd + +cd /workspace/TensorRT/demo/BERT +bash ./scripts/download_squad.sh +bash ./scripts/download_model.sh large 128 +# bash ./scripts/download_model.sh large 384 + +mkdir -p engines + +python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh + +gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/24.09/bert/1/model.plan +``` + +For each Triton upgrade, the container version used to generate the model and the model path in GCS (`gs://triton_sample_models/24.09/`) should be updated accordingly with the correct version. diff --git a/deploy/gke-marketplace-app/ui.png b/deploy/gke-marketplace-app/ui.png new file mode 100644 index 0000000000..7afec326ee Binary files /dev/null and b/deploy/gke-marketplace-app/ui.png differ diff --git a/deploy/k8s-onprem/Chart.yaml b/deploy/k8s-onprem/Chart.yaml new file mode 100644 index 0000000000..92830bc297 --- /dev/null +++ b/deploy/k8s-onprem/Chart.yaml @@ -0,0 +1,44 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v2 +appVersion: "1.0" +description: Triton Inference Server +name: triton-inference-server +version: 1.0.0 +dependencies: + - name: traefik + version: "~10.6.2" + repository: "https://helm.traefik.io/traefik" + tags: + - loadBalancing + - name: prometheus-adapter + version: "~3.0.0" + repository: "https://prometheus-community.github.io/helm-charts" + tags: + - autoscaling + + diff --git a/deploy/k8s-onprem/README.md b/deploy/k8s-onprem/README.md new file mode 100644 index 0000000000..cb641830c9 --- /dev/null +++ b/deploy/k8s-onprem/README.md @@ -0,0 +1,329 @@ +<!-- +# Copyright (c) 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) + +# Kubernetes Deploy: NVIDIA Triton Inference Server Cluster + +This repository includes a Helm chart and instructions for installing NVIDIA Triton +Inference Server in an on-premises or AWS EC2 Kubernetes cluster. You can also use this +repository to enable load balancing and autoscaling for your Triton cluster. + +This guide assumes you already have a functional Kubernetes cluster with support for GPUs. +See the [NVIDIA GPU Operator documentation](https://docs.nvidia.com/datacenter/cloud-native/kubernetes/install-k8s.html) +for instructions on how to install Kubernetes and enable GPU access in your Kubernetes cluster. +You must also have Helm installed (see [Installing Helm](#installing-helm) for instructions). Note the following requirements: + +* To deploy Prometheus and Grafana to collect and display Triton metrics, your cluster must contain sufficient CPU resources to support these services. + +* To use GPUs for inferencing, your cluster must be configured to contain the desired number of GPU nodes, with +support for the NVIDIA driver and CUDA version required by the version +of the inference server you are using. + +* To enable autoscaling, your cluster's kube-apiserver must have the [aggregation layer +enabled](https://kubernetes.io/docs/tasks/extend-kubernetes/configure-aggregation-layer/). +This will allow the horizontal pod autoscaler to read custom metrics from the prometheus adapter. + +This Helm chart is available from [Triton Inference Server +GitHub.](https://github.com/triton-inference-server/server) + +For more information on Helm and Helm charts, visit the [Helm documentation](https://helm.sh/docs/). + +## Quickstart + +First, clone this repository to a local machine. Then, execute the following commands: + +Install helm + +``` +$ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +$ chmod 700 get_helm.sh +$ ./get_helm.sh +``` + +Deploy Prometheus and Grafana + +``` +$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +$ helm repo update +$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack +``` + +Deploy Triton with default settings + +``` +helm install example ./deploy/k8s-onprem +``` + + +<!-- The steps below describe how to set-up a model repository, use Helm to +launch the inference server, and then send inference requests to the +running server. You can access a Grafana endpoint to see real-time +metrics reported by the inference server. --> + + +## Installing Helm + +### Helm v3 + +If you do not already have Helm installed in your Kubernetes cluster, +executing the following steps from the [official Helm install +guide](https://helm.sh/docs/intro/install/) will +give you a quick setup. 
+ +If you are currently using Helm v2 and would like to migrate to Helm v3, +see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/). + +## Model Repository +If you already have a model repository, you may use that with this Helm +chart. If you do not have a model repository, you can check out a local +copy of the server source repository to create an example +model repository: + +``` +$ git clone https://github.com/triton-inference-server/server.git +``` + +Triton Server needs a repository of models that it will make available +for inferencing. For this example, we are using an existing NFS server and +placing our model files there. See the +[Model Repository documentation](../../docs/user_guide/model_repository.md) for other +supported locations. + +Following the [QuickStart](../../docs/getting_started/quickstart.md), download the +example model repository to your system and copy it onto your NFS server. +Then, add the URL or IP address of your NFS server and the server path of your +model repository to `values.yaml`. + + +## Deploy Prometheus and Grafana + +The inference server metrics are collected by Prometheus and viewable +through Grafana. The inference server Helm chart assumes that Prometheus +and Grafana are available, so this step must be followed even if you +do not want to use Grafana. + +Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) Helm chart to install these components. The +*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that +Prometheus can find the inference server metrics in the *example* +release deployed in a later section. + +``` +$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +$ helm repo update +$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack +``` + +Then port-forward to the Grafana service so you can access it from +your local browser. + +``` +$ kubectl port-forward service/example-metrics-grafana 8080:80 +``` + +Now you should be able to navigate in your browser to localhost:8080 +and see the Grafana login page. Use username=admin and +password=prom-operator to log in. + +An example Grafana dashboard is available in dashboard.json. Use the +import function in Grafana to import and view this dashboard. + +## Enable Autoscaling +To enable autoscaling, ensure that the autoscaling tag in `values.yaml` is set to `true`. +This will do two things: + +1. Deploy a Horizontal Pod Autoscaler that will scale replicas of the triton-inference-server +based on the information included in `values.yaml`. + +2. Install the [prometheus-adapter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-adapter) Helm chart, allowing the Horizontal Pod Autoscaler to scale +based on custom metrics from Prometheus. + +The included configuration will scale Triton pods based on the average queue time, +as described in [this blog post](https://developer.nvidia.com/blog/deploying-nvidia-triton-at-scale-with-mig-and-kubernetes/#:~:text=Query%20NVIDIA%20Triton%20metrics%20using%20Prometheus). To customize this, +you may replace or add to the list of custom rules in `values.yaml`. If you change +the custom metric, be sure to change the values in `autoscaling.metrics`, as sketched below. + +If autoscaling is disabled, the number of Triton server pods is set to the minReplicas +variable in `values.yaml`.
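+
+For illustration only, a queue-time-based entry under `autoscaling.metrics` in
+`values.yaml` might look like the sketch below. The metric name
+`avg_time_queue_us` and the target value are assumptions for this example, not
+values taken from the chart; use whichever custom metric your prometheus-adapter
+rules actually expose.
+
+```
+autoscaling:
+  minReplicas: 1
+  maxReplicas: 3
+  metrics:
+    # Hypothetical Pods-type custom metric served by the prometheus-adapter.
+    - type: Pods
+      pods:
+        metric:
+          name: avg_time_queue_us
+        target:
+          type: AverageValue
+          averageValue: 50
+```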
+ +## Enable Load Balancing +To enable load balancing, ensure that the loadBalancing tag in `values.yaml` +is set to `true`. This will do two things: + +1. Deploy a Traefik reverse proxy through the [Traefik Helm Chart](https://github.com/traefik/traefik-helm-chart). + +2. Configure two Traefik [IngressRoutes](https://doc.traefik.io/traefik/providers/kubernetes-crd/), +one for HTTP and one for gRPC. This will allow the Traefik service to expose two +ports that will be forwarded to and balanced across the Triton pods. + +To choose the port numbers exposed, or to disable either HTTP or gRPC, edit the +configured variables in `values.yaml`. + +## Deploy the Inference Server + +Deploy the inference server, autoscaler, and load balancer using the default +configuration with the following commands. + +Here, and in the following commands, we use the name `example` for our chart. +This name will be added to the beginning of all resources created during the Helm +installation. + +``` +$ cd <directory containing Chart.yaml> +$ helm install example . +``` + +Use kubectl to see the status and wait until the inference server pods are +running. + +``` +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s +``` + +There are several ways of overriding the default configuration as +described in this [Helm +documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing). + +You can edit the values.yaml file directly or you can use the *--set* +option to override a single parameter with the CLI. For example, to +deploy a cluster with a minimum of two inference servers, use *--set* to +set the autoscaling.minReplicas parameter. + +``` +$ helm install example --set autoscaling.minReplicas=2 . +``` + +You can also write your own "config.yaml" file with the values you +want to override and pass it to Helm. If you specify a "config.yaml" file, the +values set there will override those in values.yaml. + +``` +$ cat << EOF > config.yaml +namespace: MyCustomNamespace +image: + imageName: nvcr.io/nvidia/tritonserver:custom-tag + modelRepositoryPath: gs://my_model_repository +EOF +$ helm install example -f config.yaml . +``` + +## Probe Configuration + +`templates/deployment.yaml` contains the configuration of the `livenessProbe`, `readinessProbe`, and `startupProbe` for the Triton server container. +By default, Triton loads all the models before starting the HTTP server that responds to the probes. This process can take several minutes, depending on the model sizes. +If loading is not completed within `startupProbe.failureThreshold * startupProbe.periodSeconds` seconds, Kubernetes treats this as a pod failure and restarts the pod, +which can end up in an infinite loop of restarting pods, so make sure these values are set sufficiently high for your use case. +The liveness and readiness probes are sent only after the startup probe first succeeds. + +For more details, see the [Kubernetes probe documentation](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) and the [feature page of the startup probe](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/950-liveness-probe-holdoff/README.md). + +## Using Triton Inference Server + +Now that the inference server is running, you can send HTTP or gRPC +requests to it to perform inferencing.
By default, this chart deploys [Traefik](https://traefik.io/) +and uses [IngressRoutes](https://doc.traefik.io/traefik/providers/kubernetes-crd/) +to balance requests across all available nodes. + +To send requests through the Traefik proxy, use the cluster IP of the +Traefik service deployed by the Helm chart. In the example output below, it is 10.111.128.124. + +``` +$ kubectl get services +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +... +example-traefik LoadBalancer 10.111.128.124 <pending> 8001:31752/TCP,8000:31941/TCP,80:30692/TCP,443:30303/TCP 74m +example-triton-inference-server ClusterIP None <none> 8000/TCP,8001/TCP,8002/TCP 74m +``` + +Use the following command to capture the cluster IP in a shell variable: +``` +cluster_ip=`kubectl get svc -l app.kubernetes.io/name=traefik -o=jsonpath='{.items[0].spec.clusterIP}'` +``` + + +The Traefik reverse proxy exposes an HTTP endpoint on port 8000, a gRPC +endpoint on port 8001, and a Prometheus metrics endpoint on +port 8002. You can use curl to get the metadata of the inference server +from the HTTP endpoint. + +``` +$ curl $cluster_ip:8000/v2 +``` + +Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example +image classification client that can be used to perform inferencing +using image classification models on the inference +server. For example, + +``` +$ image_client -u $cluster_ip:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg +Request 0, batch size 1 +Image 'images/mug.jpg': + 504 (COFFEE MUG) = 0.723992 + 968 (CUP) = 0.270953 + 967 (ESPRESSO) = 0.00115997 +``` + +## Testing Load Balancing and Autoscaling +After you have confirmed that your Triton cluster is operational and can perform inference, +you can test the load balancing and autoscaling features by sending a heavy load of requests. +One option for doing this is using the +[perf_analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) +application. + +You can apply a progressively increasing load with a command like: +``` +perf_analyzer -m simple -u $cluster_ip:8000 --concurrency-range 1:10 +``` + +From your Grafana dashboard, you should be able to see the number of pods increase +as the load increases, with requests being routed evenly to the new pods. + +## Cleanup + +After you have finished using the inference server, you should use Helm to +delete the deployment.
+ +``` +$ helm list +NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE +example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default +example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default + +$ helm uninstall example +$ helm uninstall example-metrics +``` + +For the Prometheus and Grafana services, you should [explicitly delete +CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart): + +``` +$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com +``` diff --git a/deploy/k8s-onprem/dashboard.json b/deploy/k8s-onprem/dashboard.json new file mode 100644 index 0000000000..9c99a2751c --- /dev/null +++ b/deploy/k8s-onprem/dashboard.json @@ -0,0 +1,1172 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "count(count(nv_inference_count) by (instance))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Active Triton Instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 50, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "example-triton-inference-server-6784d84f5d-v9scn" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 0 + }, + "id": 11, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum by (pod) (rate(nv_inference_count[1m])) / ignoring(pod) group_left sum (rate(nv_inference_count[1m]))", + "instant": false, + "interval": "", + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Proportion of Requests by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum(nv_inference_request_success) by (pod)", + "interval": "", + "legendFormat": "Success {{pod}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum(nv_inference_request_failure) by (pod)", + "interval": "", + "legendFormat": "Failure {{pod}}", + "refId": "B" + } + ], + "title": "Cumulative Inference Requests by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { 
+ "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Compute Time (ms)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "sum(rate(nv_inference_compute_infer_duration_us[30s])) by (model) / 1000", + "interval": "", + "legendFormat": "{{model}}", + "refId": "A" + } + ], + "title": "Compute Time by Model (milliseconds)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Queue Time (ms)", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "µs" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "exemplar": true, + "expr": "avg(rate(nv_inference_queue_duration_us[30s])/(1+rate(nv_inference_request_success[30s]))) by (pod)", + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Average Queue Time by Pod (microseconds)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 18, + "x": 0, + "y": 25 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nv_gpu_power_usage", + "interval": "", + "legendFormat": "GPU {{ gpu_uuid }}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 25 + }, + "id": 16, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(nv_gpu_power_usage)", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "GPU Power Total", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"nv_gpu_memory_used_bytes", + "interval": "", + "legendFormat": "GPU {{gpu_uuid}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Framebuffer Mem Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nv_gpu_utilization * 100", + "interval": "", + "legendFormat": "GPU {{gpu_uuid}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nv_cpu_memory_used_bytes", + "hide": false, + "instant": false, + "legendFormat": "Memory", + "range": true, + "refId": "A" + } + ], + "title": "Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "nv_cpu_utilization * 100", + "interval": "", + "legendFormat": "CPU", + "range": true, + "refId": "A" + } + ], + "title": "CPU Utilization", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Triton Inference Server", + "uid": "slEY4dsZk", + "version": 5, + "weekStart": "" +} \ No newline at end of file diff --git a/deploy/k8s-onprem/templates/_helpers.tpl b/deploy/k8s-onprem/templates/_helpers.tpl new file mode 100644 index 0000000000..a65331e0f0 --- /dev/null +++ b/deploy/k8s-onprem/templates/_helpers.tpl @@ -0,0 +1,111 @@ +{{/* +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/}} + +# Defines a set of helper functions that produce templated values for other files. +# Mostly for things like names and labels. This file does not produce any +# kubernetes resources by itself + +{{/* vim: set filetype=mustache: */}} +{{/* +Create inference server name. +*/}} +{{- define "triton-inference-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "triton-inference-server.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* + Create inference server metrics service name and fullname derived from above and + truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{/* + Create inference server metrics monitor name and fullname derived from + above and truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics-monitor.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics-monitor.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{/* + Create ingressroute names derived from above and truncated appropriately +*/}} +{{- define "triton-inference-server-ingressroute-http.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 50 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "ingress-http" -}} +{{- end -}} + +{{- define "triton-inference-server-ingressroute-grpc.name" -}} +{{- $basename := include "triton-inference-server.name" . 
-}} +{{- $basename_trimmed := $basename | trunc 50 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "ingress-grpc" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "triton-inference-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/k8s-onprem/templates/deployment.yaml b/deploy/k8s-onprem/templates/deployment.yaml new file mode 100644 index 0000000000..8c3a19d136 --- /dev/null +++ b/deploy/k8s-onprem/templates/deployment.yaml @@ -0,0 +1,111 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Creates a deployment for the Triton Inference Server pods +# Each pod contains a Triton container and an nfs mount as specified in +# values.yaml for the model repository + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + replicas: {{ .Values.autoscaling.minReplicas }} + selector: + matchLabels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + + spec: + volumes: + - name: models + nfs: + server: {{ .Values.image.modelRepositoryServer }} + path: {{ .Values.image.modelRepositoryPath }} + readOnly: false + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.imageName }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + volumeMounts: + - mountPath: /models + name: models + + resources: + limits: + nvidia.com/gpu: {{ .Values.image.numGpus }} + + args: + - tritonserver + {{- range .Values.serverArgs }} + - {{ . 
}} + {{- end }} + + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + livenessProbe: + initialDelaySeconds: 15 + failureThreshold: 3 + periodSeconds: 10 + httpGet: + path: /v2/health/live + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /v2/health/ready + port: http + startupProbe: + # allows Triton to load the models during 30*10 = 300 sec = 5 min + # starts checking the other probes only after the success of this one + # for details, see https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-startup-probes + periodSeconds: 10 + failureThreshold: 30 + httpGet: + path: /v2/health/ready + port: http + + securityContext: + runAsUser: 1000 + fsGroup: 1000 diff --git a/deploy/k8s-onprem/templates/hpa.yaml b/deploy/k8s-onprem/templates/hpa.yaml new file mode 100644 index 0000000000..4a4afa48d9 --- /dev/null +++ b/deploy/k8s-onprem/templates/hpa.yaml @@ -0,0 +1,52 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Creates the horizontal pod autoscaler for the Triton pod deployment. +# In order to use custom metrics (ie metrics other than CPU usage) with this +# autoscaler, you must have enabled installation of the prometheus adapter. +# This autoscaler (and the prometheus adapter) will only be installed in the +# autoscaling tag is set to true. + +{{- if .Values.tags.autoscaling }} +apiVersion: autoscaling/v2beta2 +kind: HorizontalPodAutoscaler +metadata: + name: triton-hpa + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ template "triton-inference-server.fullname" . 
}} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: {{ toYaml .Values.autoscaling.metrics | nindent 2}} +{{- end -}} diff --git a/deploy/k8s-onprem/templates/ingressroute.yaml b/deploy/k8s-onprem/templates/ingressroute.yaml new file mode 100644 index 0000000000..ee1cbee76f --- /dev/null +++ b/deploy/k8s-onprem/templates/ingressroute.yaml @@ -0,0 +1,69 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Creates the traefik IngressRoutes that allow for external access to the +# triton service. Two routes are created, one for gRPC and one for HTTP. +# Requires deployment of the traefik IngressRoute CRD, along with various roles +# and permissions, most easily accomplished through the referenced traefik +# helm chart. Will only be installed if the loadBalancing tag is set to true. + +{{- if .Values.tags.loadBalancing }} +apiVersion: traefik.containo.us/v1alpha1 +kind: IngressRoute +metadata: + name: {{ template "triton-inference-server-ingressroute-http.name" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + entryPoints: + - triton-http + routes: + - match: PathPrefix(`/`) + kind: Rule + services: + - name: {{ template "triton-inference-server.fullname" . }} + port: 8000 +--- +apiVersion: traefik.containo.us/v1alpha1 +kind: IngressRoute +metadata: + name: {{ template "triton-inference-server-ingressroute-grpc.name" . }} + namespace: {{ .Release.Namespace }} +spec: + entryPoints: + - triton-grpc + routes: + - match: PathPrefix(`/`) + kind: Rule + services: + - name: {{ template "triton-inference-server.fullname" . 
}} + port: 8001 + scheme: h2c +{{- end -}} diff --git a/deploy/k8s-onprem/templates/service.yaml b/deploy/k8s-onprem/templates/service.yaml new file mode 100644 index 0000000000..6d5bf2cb00 --- /dev/null +++ b/deploy/k8s-onprem/templates/service.yaml @@ -0,0 +1,94 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Defines the services for triton and the triton metrics service. +# Also creates a ServiceMonitor for the triton metrics service. + +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + clusterIP: None + ports: + - port: 8000 + targetPort: http + name: http-inference-server + - port: 8001 + targetPort: grpc + name: grpc-inference-server + - port: 8002 + targetPort: metrics + name: metrics-inference-server + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server-metrics.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" +spec: + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics-monitor.name" . }} + chart: {{ template "triton-inference-server.chart" . 
}} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + selector: + matchLabels: + app: {{ template "triton-inference-server-metrics.name" . }} + endpoints: + - port: metrics + interval: 15s diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml new file mode 100644 index 0000000000..ccee5e9c24 --- /dev/null +++ b/deploy/k8s-onprem/values.yaml @@ -0,0 +1,83 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +tags: + autoscaling: true + loadBalancing: true + +image: + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 + pullPolicy: IfNotPresent + modelRepositoryServer: < Replace with the IP Address of your file server > + modelRepositoryPath: /srv/models + numGpus: 1 + +# add server args here e.g. --grpc-use-ssl, --grpc-server-certs, repository-poll-secs, etc +serverArgs: + - '--model-repository=/models' + +traefik: + ports: + triton-http: + port: 18000 + exposedPort: 8000 + expose: true + protocol: TCP + triton-grpc: + port: 18001 + exposedPort: 8001 + expose: true + protocol: TCP + +autoscaling: + minReplicas: 1 + maxReplicas: 3 + metrics: + - type: Pods + pods: + metric: + name: avg_time_queue_us + target: + type: AverageValue + averageValue: 50 + +prometheus-adapter: + prometheus: + url: http://example-metrics-kube-prome-prometheus.default.svc.cluster.local + port: 9090 + rules: + custom: + - seriesQuery: 'nv_inference_queue_duration_us{namespace="default",pod!=""}' + resources: + overrides: + namespace: + resource: "namespace" + pod: + resource: "pod" + name: + matches: "nv_inference_queue_duration_us" + as: "avg_time_queue_us" + metricsQuery: 'avg(delta(nv_inference_queue_duration_us{<<.LabelMatchers>>}[30s])/(1+delta(nv_inference_request_success{<<.LabelMatchers>>}[30s]))) by (<<.GroupBy>>)' diff --git a/deploy/mlflow-triton-plugin/README.md b/deploy/mlflow-triton-plugin/README.md new file mode 100644 index 0000000000..c011194299 --- /dev/null +++ b/deploy/mlflow-triton-plugin/README.md @@ -0,0 +1,255 @@ +<!-- +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+# MLflow Triton
+
+MLflow plugin for deploying your models from MLflow to Triton Inference Server.
+Scripts are included for publishing models that are already in the
+Triton-recognized structure to your MLflow Model Registry.
+
+### Supported flavors
+
+The MLflow Triton plugin currently supports the following flavors. You may
+substitute the flavor specification in the examples below according to the
+model to be deployed.
+
+* onnx
+* triton
+
+## Requirements
+
+* MLflow
+* Triton Python HTTP client
+* Triton Inference Server
+
+## Installation
+
+The plugin can be installed from source using the following command:
+
+```
+python setup.py install
+```
+
+## Quick Start
+
+In this documentation, we will use the files in `examples` to showcase how
+the plugin interacts with Triton Inference Server. The `onnx_float32_int32_int32`
+model in `examples` is a simple model that takes two float32 inputs, INPUT0 and
+INPUT1, with shape [-1, 16], and produces two int32 outputs, OUTPUT0 and
+OUTPUT1, where OUTPUT0 is the element-wise summation of INPUT0 and INPUT1 and
+OUTPUT1 is the element-wise subtraction of INPUT0 and INPUT1.
+
+### Start Triton Inference Server in EXPLICIT mode
+
+The MLflow Triton plugin must work with a running Triton server; see the
+Triton Inference Server
+[documentation](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/quickstart.md)
+for how to start the server. Note that
+the server should be run in EXPLICIT mode (`--model-control-mode=explicit`)
+to exploit the deployment feature of the plugin.
+
+Once the server has started, the following environment variables must be set so
+that the plugin can interact with the server properly:
+* `TRITON_URL`: The address of the Triton HTTP endpoint
+* `TRITON_MODEL_REPO`: The path to the Triton model repository. It can be an s3
+URI, but keep in mind that the environment variables AWS_ACCESS_KEY_ID and
+AWS_SECRET_ACCESS_KEY are needed in that case.
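+
+For example, a minimal Python sketch of this setup might look like the
+following; the endpoint and repository path are placeholders, not values
+prescribed by this guide.
+
+```
+# Placeholder values shown for illustration; point these at your own server
+# and at the same model repository the Triton server was started with.
+import os
+
+os.environ["TRITON_URL"] = "localhost:8000"
+os.environ["TRITON_MODEL_REPO"] = "/path/to/model_repository"
+
+from mlflow.deployments import get_deploy_client
+
+client = get_deploy_client("triton")
+print(client.list_deployments())  # expected to be empty on a fresh server
+```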
+
+### Publish models to MLflow
+
+#### ONNX flavor
+
+The MLflow ONNX built-in functionality can be used to publish `onnx` flavor
+models to MLflow directly, and the MLflow Triton plugin will prepare the model
+in the format expected by Triton. You may also log
+[`config.pbtxt`](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_configuration.md)
+as an additional artifact, which Triton will use to serve the model. Otherwise,
+the server should be run with the auto-complete feature enabled
+(`--strict-model-config=false`) to generate the model configuration.
+
+```
+import mlflow.onnx
+import onnx
+model = onnx.load("examples/onnx_float32_int32_int32/1/model.onnx")
+mlflow.onnx.log_model(model, "triton", registered_model_name="onnx_float32_int32_int32")
+```
+
+#### Triton flavor
+
+For other model frameworks that Triton supports but that are not yet recognized
+by the MLflow Triton plugin, the `publish_model_to_mlflow.py` script can be used
+to publish `triton` flavor models to MLflow. A `triton` flavor model is a
+directory containing the model files following the
+[model layout](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md#repository-layout).
+Below is an example usage:
+
+```
+cd /scripts
+
+python publish_model_to_mlflow.py --model_name onnx_float32_int32_int32 --model_directory <path-to-the-examples-directory>/onnx_float32_int32_int32 --flavor triton
+```
+
+### Deploy models tracked in MLflow to Triton
+
+Once a model is published and tracked in MLflow, it can be deployed to Triton
+via MLflow's deployments command. The following command will download the model
+to Triton's model repository and request Triton to load the model.
+
+```
+mlflow deployments create -t triton --flavor triton --name onnx_float32_int32_int32 -m models:/onnx_float32_int32_int32/1
+```
+
+### Perform inference
+
+After the model is deployed, the following command shows the CLI usage to send
+an inference request to a deployment.
+
+```
+mlflow deployments predict -t triton --name onnx_float32_int32_int32 --input-path <path-to-the-examples-directory>/input.json --output-path output.json
+```
+
+The inference result will be written to `output.json` and you may compare it
+with the results in `expected_output.json`.
+
+## MLflow Deployments
+
+"MLflow Deployments" is a set of MLflow APIs for deploying MLflow models to
+custom serving tools. The MLflow Triton plugin implements the following
+deployment functions to support interaction with the Triton server from MLflow.
+
+### Create Deployment
+
+MLflow deployments create API deploys a model to the Triton target, which will
+download the model to Triton's model repository and request Triton to load the
+model.
+
+To create an MLflow deployment using CLI:
+
+```
+mlflow deployments create -t triton --flavor triton --name model_name -m models:/model_name/1
+```
+
+To create an MLflow deployment using Python API:
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.create_deployment("model_name", "models:/model_name/1", flavor="triton")
+```
+
+### Delete Deployment
+
+MLflow deployments delete API removes an existing deployment from the Triton
+target, which will remove the model in Triton's model repository and request
+Triton to unload the model.
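+
+Under the hood this is roughly equivalent to asking Triton to unload the model
+and then removing the deployment files from the model repository. The sketch
+below only illustrates that idea for a local (non-S3) repository and is not the
+plugin's exact implementation, which also removes an `mlflow-meta.json` file;
+the supported CLI and Python API usage follows.
+
+```
+# Illustrative sketch only; the repository path is a placeholder.
+import os
+import shutil
+import tritonclient.http as tritonhttpclient
+
+def unload_and_remove(name, triton_url="localhost:8000",
+                      model_repo="/path/to/model_repository"):
+    client = tritonhttpclient.InferenceServerClient(url=triton_url)
+    client.unload_model(name)  # request Triton to unload the model
+    model_dir = os.path.join(model_repo, name)
+    if os.path.isdir(model_dir):
+        shutil.rmtree(model_dir)  # remove the deployment files
+```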
+
+To delete an MLflow deployment using CLI
+
+```
+mlflow deployments delete -t triton --name model_name
+```
+
+To delete an MLflow deployment using Python API
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.delete_deployment("model_name")
+```
+
+### Update Deployment
+
+MLflow deployments update API updates an existing deployment with another model
+(version) tracked in MLflow, which will overwrite the model in Triton's model
+repository and request Triton to reload the model.
+
+To update an MLflow deployment using CLI
+
+```
+mlflow deployments update -t triton --flavor triton --name model_name -m models:/model_name/2
+```
+
+To update an MLflow deployment using Python API
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.update_deployment("model_name", "models:/model_name/2", flavor="triton")
+```
+
+### List Deployments
+
+MLflow deployments list API lists all existing deployments in the Triton target.
+
+To list all MLflow deployments using CLI
+
+```
+mlflow deployments list -t triton
+```
+
+To list all MLflow deployments using Python API
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.list_deployments()
+```
+
+### Get Deployment
+
+MLflow deployments get API returns information regarding a specific deployment
+in the Triton target.
+
+To get a specific MLflow deployment using CLI
+```
+mlflow deployments get -t triton --name model_name
+```
+
+To get a specific MLflow deployment using Python API
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.get_deployment("model_name")
+```
+
+### Run Inference on Deployments
+
+MLflow deployments predict API runs inference by preparing and sending the
+request to Triton, and returns the Triton response.
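+
+For reference, the sketch below shows one way to construct the `inputs`
+argument used by the Python API example that follows. It is an illustration
+only and assumes the example `onnx_float32_int32_int32` model is already
+deployed and the environment variables above are set.
+
+```
+import numpy as np
+from mlflow.deployments import get_deploy_client
+
+client = get_deploy_client('triton')
+
+# The example model takes two float32 tensors of shape [-1, 16].
+inputs = {
+    "INPUT0": np.arange(1, 17, dtype=np.float32).reshape(1, 16),
+    "INPUT1": np.arange(1, 17, dtype=np.float32).reshape(1, 16),
+}
+
+result = client.predict("onnx_float32_int32_int32", inputs)
+# The plugin returns a pandas DataFrame keyed by output name; OUTPUT0 holds
+# the element-wise sum of INPUT0 and INPUT1.
+print(result["outputs"]["OUTPUT0"])
+```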
+ +To run inference using CLI + +``` +mlflow deployments predict -t triton --name model_name --input-path input_file --output-path output_file + +``` + +To run inference using Python API + +``` +from mlflow.deployments import get_deploy_client +client = get_deploy_client('triton') +client.predict("model_name", inputs) +``` diff --git a/deploy/mlflow-triton-plugin/examples/expected_output.json b/deploy/mlflow-triton-plugin/examples/expected_output.json new file mode 100644 index 0000000000..320f8f4815 --- /dev/null +++ b/deploy/mlflow-triton-plugin/examples/expected_output.json @@ -0,0 +1,6 @@ +{"outputs": + { + "OUTPUT0": [[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]], + "OUTPUT1": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] + } +} \ No newline at end of file diff --git a/deploy/mlflow-triton-plugin/examples/input.json b/deploy/mlflow-triton-plugin/examples/input.json new file mode 100644 index 0000000000..418396ccf0 --- /dev/null +++ b/deploy/mlflow-triton-plugin/examples/input.json @@ -0,0 +1,6 @@ +{"inputs": + { + "INPUT0": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]], + "INPUT1": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]] + } +} \ No newline at end of file diff --git a/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx new file mode 100755 index 0000000000..f12d500597 Binary files /dev/null and b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx differ diff --git a/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt new file mode 100644 index 0000000000..75ea016cfa --- /dev/null +++ b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt @@ -0,0 +1,57 @@ + +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+platform: "onnxruntime_onnx" +max_batch_size: 8 +version_policy: { latest { num_versions: 1 }} +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] \ No newline at end of file diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py new file mode 100755 index 0000000000..0b73b537d4 --- /dev/null +++ b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/config.py b/deploy/mlflow-triton-plugin/mlflow_triton/config.py new file mode 100755 index 0000000000..0a381fd407 --- /dev/null +++ b/deploy/mlflow-triton-plugin/mlflow_triton/config.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os +import re +from collections import namedtuple + +from mlflow.exceptions import MlflowException + + +class Config(dict): + def __init__(self): + super().__init__() + self["triton_url"] = os.environ.get("TRITON_URL") + self["triton_model_repo"] = os.environ.get("TRITON_MODEL_REPO") + + if self["triton_model_repo"].startswith("s3://"): + self.s3_regex = re.compile( + "s3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/" + "([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)" + ) + + uri = self.parse_path(self["triton_model_repo"]) + if uri.protocol == "https://": + protocol = "https://" + else: + protocol = "http://" + endpoint_url = None + if uri.host_name != "" and uri.host_port != "": + endpoint_url = "{}{}:{}".format(protocol, uri.host_name, uri.host_port) + + import boto3 + + # boto3 handles AWS credentials + self["s3"] = boto3.client("s3", endpoint_url=endpoint_url) + self["s3_bucket"] = uri.bucket + self["s3_prefix"] = uri.prefix + self["triton_model_repo"] = "s3://{}".format( + os.path.join(uri.bucket, uri.prefix) + ) + + def parse_path(self, path): + # Cleanup extra slashes + clean_path = self.clean_path(path) + + # Get the bucket name and the object path. 
Return error if path is malformed + match = self.s3_regex.fullmatch(clean_path) + S3URI = namedtuple( + "S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"] + ) + if match: + uri = S3URI(*match.group(1, 2, 3, 4, 5)) + if uri.prefix and uri.prefix[0] == "/": + uri = uri._replace(prefix=uri.prefix[1:]) + else: + bucket_start = clean_path.find("s3://") + len("s3://") + bucket_end = clean_path.find("/", bucket_start) + + # If there isn't a slash, the address has only the bucket + if bucket_end > bucket_start: + bucket = clean_path[bucket_start:bucket_end] + prefix = clean_path[bucket_end + 1 :] + else: + bucket = clean_path[bucket_start:] + prefix = "" + uri = S3URI("", "", "", bucket, prefix) + + if uri.bucket == "": + raise MlflowException("No bucket name found in path: " + path) + + return uri + + def clean_path(self, s3_path): + # Must handle paths with s3 prefix + start = s3_path.find("s3://") + path = "" + if start != -1: + path = s3_path[start + len("s3://") :] + clean_path = "s3://" + else: + path = s3_path + clean_path = "" + + # Must handle paths with https:// or http:// prefix + https_start = path.find("https://") + if https_start != -1: + path = path[https_start + len("https://") :] + clean_path += "https://" + else: + http_start = path.find("http://") + if http_start != -1: + path = path[http_start + len("http://") :] + clean_path += "http://" + + # Remove trailing slashes + rtrim_length = len(path.rstrip("/")) + if rtrim_length == 0: + raise MlflowException("Invalid bucket name: '" + path + "'") + + # Remove leading slashes + ltrim_length = len(path) - len(path.lstrip("/")) + if ltrim_length == len(path): + raise MlflowException("Invalid bucket name: '" + path + "'") + + # Remove extra internal slashes + true_path = path[ltrim_length : rtrim_length + 1] + previous_slash = False + for i in range(len(true_path)): + if true_path[i] == "/": + if not previous_slash: + clean_path += true_path[i] + previous_slash = True + else: + clean_path += true_path[i] + previous_slash = False + + return clean_path diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py new file mode 100755 index 0000000000..bebe559b9e --- /dev/null +++ b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py @@ -0,0 +1,540 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import ast +import glob +import json +import logging +import os +import shutil +from pathlib import Path + +import numpy as np +import pandas as pd +import tritonclient.http as tritonhttpclient +from mlflow.deployments import BaseDeploymentClient +from mlflow.exceptions import MlflowException +from mlflow.models import Model +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow_triton.config import Config +from tritonclient.utils import ( + InferenceServerException, + np_to_triton_dtype, + triton_to_np_dtype, +) + +logger = logging.getLogger(__name__) + +_MLFLOW_META_FILENAME = "mlflow-meta.json" + + +class TritonPlugin(BaseDeploymentClient): + def __init__(self, uri): + """ + Initializes the deployment plugin, sets the triton model repo + """ + super(TritonPlugin, self).__init__(target_uri=uri) + self.server_config = Config() + triton_url, self.triton_model_repo = self._get_triton_server_config() + # need to add other flavors + self.supported_flavors = ["triton", "onnx"] + # URL cleaning for constructing Triton client + ssl = False + if triton_url.startswith("http://"): + triton_url = triton_url[len("http://") :] + elif triton_url.startswith("https://"): + triton_url = triton_url[len("https://") :] + ssl = True + self.triton_client = tritonhttpclient.InferenceServerClient( + url=triton_url, ssl=ssl + ) + + def _get_triton_server_config(self): + triton_url = "localhost:8000" + if self.server_config["triton_url"]: + triton_url = self.server_config["triton_url"] + logger.info("Triton url = {}".format(triton_url)) + + if not self.server_config["triton_model_repo"]: + raise Exception("Check that environment variable TRITON_MODEL_REPO is set") + triton_model_repo = self.server_config["triton_model_repo"] + logger.info("Triton model repo = {}".format(triton_model_repo)) + + return triton_url, triton_model_repo + + def create_deployment(self, name, model_uri, flavor=None, config=None): + """ + Deploy the model at the model_uri to the Triton model repo. Associated config.pbtxt and *labels* files will be deployed. + + :param name: Name of the of the model + :param model_uri: Model uri in format model:/<model-name>/<version-or-stage> + :param flavor: Flavor of the deployed model + :param config: Configuration parameters + + :return: Model flavor and name + """ + self._validate_flavor(flavor) + + # Verify model does not already exist in Triton + if self._model_exists(name): + raise Exception( + "Unable to create deployment for name %s because it already exists." 
+ % (name) + ) + + # Get the path of the artifact + path = Path(_download_artifact_from_uri(model_uri)) + self._copy_files_to_triton_repo(path, name, flavor) + self._generate_mlflow_meta_file(name, flavor, model_uri) + + try: + self.triton_client.load_model(name) + except InferenceServerException as ex: + raise MlflowException(str(ex)) + + return {"name": name, "flavor": flavor} + + def delete_deployment(self, name): + """ + Delete the deployed model in Triton with the provided model name + + :param name: Name of the of the model with version number. For ex: "densenet_onnx/2" + + :return: None + """ + # Verify model is already deployed to Triton + if not self._model_exists(name): + raise Exception( + "Unable to delete deployment for name %s because it does not exist." + % (name) + ) + + try: + self.triton_client.unload_model(name) + except InferenceServerException as ex: + raise MlflowException(str(ex)) + + self._delete_deployment_files(name) + + return None + + def update_deployment(self, name, model_uri=None, flavor=None, config=None): + """ + Update the model deployment in triton with the provided name + + :param name: Name and version number of the model, <model_name>/<version>. + :param model_uri: Model uri models:/model_name/version + :param flavor: The flavor of the model + :param config: Configuration parameters + + :return: Returns the flavor of the model + """ + # TODO: Update this function with a warning. If config and label files associated with this + # updated model are different than the ones already deployed to triton, issue a warning to the user. + self._validate_flavor(flavor) + + # Verify model is already deployed to Triton + if not self._model_exists(name): + raise Exception( + "Unable to update deployment for name %s because it does not exist." + % (name) + ) + + self.get_deployment(name) + + # Get the path of the artifact + path = Path(_download_artifact_from_uri(model_uri)) + + self._copy_files_to_triton_repo(path, name, flavor) + + self._generate_mlflow_meta_file(name, flavor, model_uri) + + try: + self.triton_client.load_model(name) + except InferenceServerException as ex: + raise MlflowException(str(ex)) + + return {"flavor": flavor} + + def list_deployments(self): + """ + List models deployed to Triton. + + :return: None + """ + resp = self.triton_client.get_model_repository_index() + actives = [] + for d in resp: + if "state" in d and d["state"] == "READY": + mlflow_meta_path = os.path.join( + self.triton_model_repo, d["name"], _MLFLOW_META_FILENAME + ) + if "s3" in self.server_config: + meta_dict = ast.literal_eval( + self.server_config["s3"] + .get_object( + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], + d["name"], + _MLFLOW_META_FILENAME, + ), + )["Body"] + .read() + .decode("utf-8") + ) + elif os.path.isfile(mlflow_meta_path): + meta_dict = self._get_mlflow_meta_dict(d["name"]) + else: + continue + + d["triton_model_path"] = meta_dict["triton_model_path"] + d["mlflow_model_uri"] = meta_dict["mlflow_model_uri"] + d["flavor"] = meta_dict["flavor"] + actives.append(d) + + return actives + + def get_deployment(self, name): + """ + Get deployment from Triton. + + :param name: Name of the model. 
\n + Ex: "mini_bert_onnx" - gets the details of active version of this model \n + + :return: output - Returns a dict with model info + """ + deployments = self.list_deployments() + for d in deployments: + if d["name"] == name: + return d + raise ValueError(f"Unable to get deployment with name {name}") + + def predict(self, deployment_name, df): + single_input_np = None + if isinstance(df, np.ndarray): + single_input_np = df + + inputs = [] + if single_input_np is not None: + raise MlflowException("Unnamed input is not currently supported") + else: + if isinstance(df, pd.DataFrame): + model_metadata = self.triton_client.get_model_metadata(deployment_name) + input_dtype = {} + for input in model_metadata["inputs"]: + input_dtype[input["name"]] = triton_to_np_dtype(input["datatype"]) + # Sanity check + if len(df.columns) != 1: + raise MlflowException("Expect Pandas DataFrame has only 1 column") + col = df.columns[0] + for row in df.index: + val = df[col][row] + # Need to form numpy array of the data type expected + if type(df[col][row]) != np.ndarray: + val = np.array(val, dtype=input_dtype[row]) + inputs.append( + tritonhttpclient.InferInput( + row, val.shape, np_to_triton_dtype(val.dtype) + ) + ) + inputs[-1].set_data_from_numpy(val) + else: + for key, val in df.items(): + inputs.append( + tritonhttpclient.InferInput( + key, val.shape, np_to_triton_dtype(val.dtype) + ) + ) + inputs[-1].set_data_from_numpy(val) + + try: + resp = self.triton_client.infer(model_name=deployment_name, inputs=inputs) + res = {} + for output in resp.get_response()["outputs"]: + res[output["name"]] = resp.as_numpy(output["name"]) + return pd.DataFrame.from_dict({"outputs": res}) + except InferenceServerException as ex: + raise MlflowException(str(ex)) + + def _generate_mlflow_meta_file(self, name, flavor, model_uri): + triton_deployment_dir = os.path.join(self.triton_model_repo, name) + meta_dict = { + "name": name, + "triton_model_path": triton_deployment_dir, + "mlflow_model_uri": model_uri, + "flavor": flavor, + } + + if "s3" in self.server_config: + self.server_config["s3"].put_object( + Body=json.dumps(meta_dict, indent=4).encode("utf-8"), + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME + ), + ) + else: + with open( + os.path.join(triton_deployment_dir, _MLFLOW_META_FILENAME), "w" + ) as outfile: + json.dump(meta_dict, outfile, indent=4) + + print("Saved", _MLFLOW_META_FILENAME, "to", triton_deployment_dir) + + def _get_mlflow_meta_dict(self, name): + mlflow_meta_path = os.path.join( + self.triton_model_repo, name, _MLFLOW_META_FILENAME + ) + + if "s3" in self.server_config: + mlflow_meta_dict = ast.literal_eval( + self.server_config["s3"] + .get_object( + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME + ), + )["Body"] + .read() + .decode("utf-8") + ) + else: + with open(mlflow_meta_path, "r") as metafile: + mlflow_meta_dict = json.load(metafile) + + return mlflow_meta_dict + + def _get_copy_paths(self, artifact_path, name, flavor): + copy_paths = {} + copy_paths["model_path"] = {} + triton_deployment_dir = os.path.join(self.triton_model_repo, name) + if flavor == "triton": + # When flavor is 'triton', the model is assumed to be preconfigured + # with proper model versions and version strategy, which may differ from + # the versioning in MLFlow + for file in artifact_path.iterdir(): + if file.is_dir(): + copy_paths["model_path"]["from"] = file + break + 
copy_paths["model_path"]["to"] = triton_deployment_dir + elif flavor == "onnx": + # Look for model file via MLModel metadata or iterating dir + model_file = None + config_file = None + for file in artifact_path.iterdir(): + if file.name == "MLmodel": + mlmodel = Model.load(file) + onnx_meta_data = mlmodel.flavors.get("onnx", None) + if onnx_meta_data is not None: + model_file = onnx_meta_data.get("data", None) + elif file.name == "config.pbtxt": + config_file = file.name + copy_paths["config_path"] = {} + elif file.suffix == ".txt" and file.stem != "requirements": + copy_paths[file.stem] = {"from": file, "to": triton_deployment_dir} + if model_file is None: + for file in artifact_path.iterdir(): + if file.suffix == ".onnx": + model_file = file.name + break + copy_paths["model_path"]["from"] = os.path.join(artifact_path, model_file) + copy_paths["model_path"]["to"] = os.path.join(triton_deployment_dir, "1") + + if config_file is not None: + copy_paths["config_path"]["from"] = os.path.join( + artifact_path, config_file + ) + copy_paths["config_path"]["to"] = triton_deployment_dir + else: + # Make sure the directory has been created for config.pbtxt + os.makedirs(triton_deployment_dir, exist_ok=True) + # Provide a minimum config file so Triton knows what backend + # should be performing the auto-completion + config = """ +backend: "onnxruntime" +default_model_filename: "{}" +""".format( + model_file + ) + with open( + os.path.join(triton_deployment_dir, "config.pbtxt"), "w" + ) as cfile: + cfile.write(config) + return copy_paths + + def _walk(self, path): + """Walk a path like os.walk() if path is dir, + return file in the expected format otherwise. + :param path: dir or file path + + :return: root, dirs, files + """ + if os.path.isfile(path): + return [(os.path.dirname(path), [], [os.path.basename(path)])] + elif os.path.isdir(path): + return list(os.walk(path)) + else: + raise Exception(f"path: {path} is not a valid path to a file or dir.") + + def _copy_files_to_triton_repo(self, artifact_path, name, flavor): + copy_paths = self._get_copy_paths(artifact_path, name, flavor) + for key in copy_paths: + if "s3" in self.server_config: + # copy model dir to s3 recursively + for root, dirs, files in self._walk(copy_paths[key]["from"]): + for filename in files: + local_path = os.path.join(root, filename) + + if flavor == "onnx": + s3_path = os.path.join( + self.server_config["s3_prefix"], + copy_paths[key]["to"] + .replace(self.server_config["triton_model_repo"], "") + .strip("/"), + filename, + ) + + elif flavor == "triton": + rel_path = os.path.relpath( + local_path, + copy_paths[key]["from"], + ) + s3_path = os.path.join( + self.server_config["s3_prefix"], name, rel_path + ) + + self.server_config["s3"].upload_file( + local_path, + self.server_config["s3_bucket"], + s3_path, + ) + else: + if os.path.isdir(copy_paths[key]["from"]): + if os.path.isdir(copy_paths[key]["to"]): + shutil.rmtree(copy_paths[key]["to"]) + shutil.copytree(copy_paths[key]["from"], copy_paths[key]["to"]) + else: + if not os.path.isdir(copy_paths[key]["to"]): + os.makedirs(copy_paths[key]["to"]) + shutil.copy(copy_paths[key]["from"], copy_paths[key]["to"]) + + if "s3" not in self.server_config: + triton_deployment_dir = os.path.join(self.triton_model_repo, name) + version_folder = os.path.join(triton_deployment_dir, "1") + os.makedirs(version_folder, exist_ok=True) + + return copy_paths + + def _delete_mlflow_meta(self, filepath): + if "s3" in self.server_config: + self.server_config["s3"].delete_object( + 
Bucket=self.server_config["s3_bucket"], + Key=filepath, + ) + elif os.path.isfile(filepath): + os.remove(filepath) + + def _delete_deployment_files(self, name): + triton_deployment_dir = os.path.join(self.triton_model_repo, name) + + if "s3" in self.server_config: + objs = self.server_config["s3"].list_objects( + Bucket=self.server_config["s3_bucket"], + Prefix=os.path.join(self.server_config["s3_prefix"], name), + ) + + for key in objs["Contents"]: + key = key["Key"] + try: + self.server_config["s3"].delete_object( + Bucket=self.server_config["s3_bucket"], + Key=key, + ) + except Exception as e: + raise Exception(f"Could not delete {key}: {e}") + + else: + # Check if the deployment directory exists + if not os.path.isdir(triton_deployment_dir): + raise Exception( + "A deployment does not exist for this model in directory {} for model name {}".format( + triton_deployment_dir, name + ) + ) + + model_file = glob.glob("{}/model*".format(triton_deployment_dir)) + for file in model_file: + print("Model directory found: {}".format(file)) + os.remove(file) + print("Model directory removed: {}".format(file)) + + # Delete mlflow meta file + mlflow_meta_path = os.path.join( + self.triton_model_repo, name, _MLFLOW_META_FILENAME + ) + self._delete_mlflow_meta(mlflow_meta_path) + + def _validate_config_args(self, config): + if not config["version"]: + raise Exception("Please provide the version as a config argument") + if not config["version"].isdigit(): + raise ValueError( + "Please make sure version is a number. version = {}".format( + config["version"] + ) + ) + + def _validate_flavor(self, flavor): + if flavor not in self.supported_flavors: + raise Exception("{} model flavor not supported by Triton".format(flavor)) + + def _model_exists(self, name): + deploys = self.list_deployments() + exists = False + for d in deploys: + if d["name"] == name: + exists = True + return exists + + +def run_local(name, model_uri, flavor=None, config=None): + raise NotImplementedError("run_local has not been implemented yet") + + +def target_help(): + help_msg = ( + "\nmlflow-triton plugin integrates the Triton Inference Server to the mlflow deployment pipeline. \n\n " + "Example command: \n\n" + ' mlflow deployments create -t triton --name mymodel --flavor onnx -m models:/mymodel/Production -C "version=1" \n\n' + "The environment variable TRITON_MODEL_REPO must be set to the location that the Triton" + "Inference Server is storing its models\n\n" + "export TRITON_MODEL_REPO = /path/to/triton/model/repo\n\n" + "Use the following config options:\n\n" + "- version: The version of the model to be released. This config will be used by Triton to create a new model sub-directory.\n" + ) + return help_msg diff --git a/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py new file mode 100755 index 0000000000..779d393020 --- /dev/null +++ b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os + +import click +import mlflow +import triton_flavor + + +@click.command() +@click.option( + "--model_name", + help="Model name", +) +@click.option( + "--model_directory", + type=click.Path(exists=True, readable=True), + required=True, + help="Model filepath", +) +@click.option( + "--flavor", + type=click.Choice(["triton"], case_sensitive=True), + required=True, + help="Model flavor", +) +def publish_to_mlflow(model_name, model_directory, flavor): + mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"] + artifact_path = "triton" + + mlflow.set_tracking_uri(uri=mlflow_tracking_uri) + + with mlflow.start_run() as run: + if flavor == "triton": + triton_flavor.log_model( + model_directory, + artifact_path=artifact_path, + registered_model_name=model_name, + ) + else: + # Enhancement, for model in other flavor (framework) that Triton + # supports, try to format it in Triton style and provide + # config.pbtxt file. Should this be done in the plugin? + raise Exception("Other flavor is not supported") + + print(mlflow.get_artifact_uri()) + + +if __name__ == "__main__": + publish_to_mlflow() diff --git a/deploy/mlflow-triton-plugin/scripts/triton_flavor.py b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py new file mode 100755 index 0000000000..7b0f61630d --- /dev/null +++ b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +""" +The ``triton`` module provides APIs for logging and loading Triton-recognized +models in the MLflow Model format. This module exports MLflow Models with the following +flavors: + +Triton format + model files in the structure that Triton can load the model from. + +""" +import os +import shutil +import sys + +from mlflow.exceptions import MlflowException +from mlflow.models import Model +from mlflow.models.model import MLMODEL_FILE_NAME +from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS +from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS +from mlflow.utils.annotations import experimental + +FLAVOR_NAME = "triton" + + +@experimental +def save_model( + triton_model_path, + path, + mlflow_model=None, +): + """ + Save an Triton model to a path on the local file system. + + :param triton_model_path: File path to Triton model to be saved. + :param path: Local path where the model is to be saved. + :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. + + """ + + path = os.path.abspath(path) + if os.path.exists(path): + raise MlflowException( + message="Path '{}' already exists".format(path), + error_code=RESOURCE_ALREADY_EXISTS, + ) + os.makedirs(path) + triton_model_path = os.path.normpath(triton_model_path) + model_data_subpath = os.path.basename(triton_model_path) + model_data_path = os.path.join(path, model_data_subpath) + + # Save Triton model + shutil.copytree(triton_model_path, model_data_path) + + mlflow_model.add_flavor(FLAVOR_NAME, data=model_data_subpath) + mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) + + +@experimental +def log_model( + triton_model_path, + artifact_path, + registered_model_name=None, + await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS, +): + """ + Log an Triton model as an MLflow artifact for the current run. + + :param triton_model_path: File path to Triton model. + :param artifact_path: Run-relative artifact path. + :param registered_model_name: (Experimental) If given, create a model version under + ``registered_model_name``, also creating a registered model if one + with the given name does not exist. + + :param await_registration_for: Number of seconds to wait for the model version to finish + being created and is in ``READY`` status. By default, the function + waits for five minutes. Specify 0 or None to skip waiting. 
+ + """ + Model.log( + artifact_path=artifact_path, + flavor=sys.modules[__name__], + triton_model_path=triton_model_path, + registered_model_name=registered_model_name, + await_registration_for=await_registration_for, + ) diff --git a/deploy/mlflow-triton-plugin/setup.py b/deploy/mlflow-triton-plugin/setup.py new file mode 100755 index 0000000000..65b8e0df1e --- /dev/null +++ b/deploy/mlflow-triton-plugin/setup.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from setuptools import find_packages, setup + +setup( + name="mlflow-triton", + version="0.2.0", + description="Triton Mlflow Deployment", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + packages=find_packages(), + install_requires=["mlflow>=2.2.1,<3.0", "tritonclient[all]", "boto3"], + entry_points={"mlflow.deployments": "triton=mlflow_triton.deployments"}, +) diff --git a/deploy/oci/Chart.yaml b/deploy/oci/Chart.yaml new file mode 100644 index 0000000000..2b7541bee6 --- /dev/null +++ b/deploy/oci/Chart.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +appVersion: "1.0" +description: Triton Inference Server +name: triton-inference-server +version: 1.0.0 diff --git a/deploy/oci/README.md b/deploy/oci/README.md new file mode 100644 index 0000000000..dc293c7378 --- /dev/null +++ b/deploy/oci/README.md @@ -0,0 +1,306 @@ +<!-- +# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> + +[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) + +# Kubernetes Deploy: Triton Inference Server Cluster + +A helm chart for installing a single cluster of Triton Inference +Server is provided. By default the cluster contains a single instance +of the inference server but the *replicaCount* configuration parameter +can be set to create a cluster of any size, as described below. + +This guide assumes you already have a functional Kubernetes cluster +and helm installed (see below for instructions on installing +helm). Note the following requirements: + +* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prometheus and Grafana in your cluster as described below and your cluster must contain sufficient CPU resources to support these services. 
+ +* If you want Triton Server to use GPUs for inferencing, your cluster +must be configured to contain the desired number of GPU nodes (A10 GPU instances recommended) +with support for the NVIDIA driver and CUDA version required by the version +of the inference server you are using. + +The steps below describe how to set-up a model repository, use helm to +launch the inference server, and then send inference requests to the +running server. You can access a Grafana endpoint to see real-time +metrics reported by the inference server. + +## Notes for OKE cluster + +When creating your node pool, the default value for the boot volume is 46.6GB. +Due to the size of the server container, it is recommended to increase this value +to 150GB and set a [cloud-init script to increase the partition](https://blogs.oracle.com/ateam/post/oke-node-sizing-for-very-large-container-images): + +``` +#!/bin/bash +curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh +bash /var/run/oke-init.sh +sudo /usr/libexec/oci-growfs -y +``` + + +## Installing Helm + +### Using Cloud Shell from OCI Web Console + +It is possible to access your OKE Cluster [directly from the OCI Web Console](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengaccessingclusterkubectl.htm). +Helm v3 is already available from the Cloud Shell. + +### Helm v3 + +If you do not already have Helm installed in your Kubernetes cluster, +executing the following steps from the [official helm install +guide](https://helm.sh/docs/intro/install/) will +give you a quick setup. + +If you're currently using Helm v2 and would like to migrate to Helm v3, +please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/). + +### Helm v2 + +> **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3. + +Below are example instructions for installing Helm v2. + +``` +$ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash +$ kubectl create serviceaccount -n kube-system tiller +serviceaccount/tiller created +$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller +$ helm init --service-account tiller --wait +``` + +If you run into any issues, you can refer to the official installation guide [here](https://v2.helm.sh/docs/install/). + +## Model Repository + +If you already have a model repository you may use that with this helm +chart. If you do not have a model repository, you can checkout a local +copy of the inference server source repository to create an example +model repository: + +``` +$ git clone https://github.com/triton-inference-server/server.git +``` + +Triton Server needs a repository of models that it will make available +for inferencing. For this example you will place the model repository +in an S3 compatible OCI Object Storage Bucket. + +``` +$ oci os bucket create --compartment-id <COMPARTMENT_OCID> --name triton-inference-server-repository +``` + +Following the [QuickStart](../../docs/getting_started/quickstart.md) download the +example model repository to your system and copy it into the OCI +Bucket. 
+
+```
+$ oci os object bulk-upload -bn triton-inference-server-repository --src-dir docs/examples/model_repository/
+```
+
+### OCI Model Repository
+To load the model from the OCI Object Storage Bucket, you need to convert the following OCI credentials to base64 format and add them to values.yaml:
+
+```
+echo -n 'REGION' | base64
+```
+```
+echo -n 'SECRET_KEY_ID' | base64
+```
+```
+echo -n 'SECRET_ACCESS_KEY' | base64
+```
+
+You also need to adapt _modelRepositoryPath_ in values.yaml to your [namespace](https://docs.oracle.com/en-us/iaas/Content/Object/Tasks/understandingnamespaces.htm) and [OCI region](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm).
+
+```
+s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository
+```
+
+## Deploy Prometheus and Grafana
+
+The inference server metrics are collected by Prometheus and viewable
+by Grafana. The inference server helm chart assumes that Prometheus
+and Grafana are available, so this step must be followed even if you
+don't want to use Grafana.
+
+Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install these components. The
+*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that
+Prometheus can find the inference server metrics in the *example*
+release deployed below.
+
+```
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack
+```
+
+Then port-forward to the Grafana service so you can access it from
+your local browser.
+
+```
+$ kubectl port-forward service/example-metrics-grafana 8080:80
+```
+
+Now you should be able to navigate in your browser to localhost:8080
+and see the Grafana login page. Use username=admin and
+password=prom-operator to login.
+
+Note that it is also possible to set a load balancer service for the Grafana dashboard
+by running:
+
+```
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false --set grafana.service.type=LoadBalancer prometheus-community/kube-prometheus-stack
+```
+
+You can then see the public IP of your Grafana dashboard by running:
+
+```
+$ kubectl get svc
+NAME                      TYPE           CLUSTER-IP    EXTERNAL-IP       PORT(S)                      AGE
+alertmanager-operated     ClusterIP      None          <none>            9093/TCP,9094/TCP,9094/UDP   2m33s
+example-metrics-grafana   LoadBalancer   10.96.82.33   141.145.220.114   80:31005/TCP                 2m38s
+```
+
+The default load balancer is created with a fixed shape and a bandwidth of 100 Mbps. You can switch to a [flexible](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengcreatingloadbalancers-subtopic.htm#contengcreatingloadbalancers_subtopic) shape and adapt the bandwidth according to your OCI limits in case the bandwidth is a bottleneck.
+
+An example Grafana dashboard is available in dashboard.json. Use the
+import function in Grafana to import and view this dashboard.
+
+## Deploy the Inference Server
+
+Deploy the inference server using the default configuration with the
+following commands.
+
+```
+$ cd <directory containing Chart.yaml>
+$ helm install example .
+```
+
+Use kubectl to see status and wait until the inference server pods are
+running.
+ +``` +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s +``` + +There are several ways of overriding the default configuration as +described in this [helm +documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing). + +You can edit the values.yaml file directly or you can use the *--set* +option to override a single parameter with the CLI. For example, to +deploy a cluster of four inference servers use *--set* to set the +replicaCount parameter. + +``` +$ helm install example --set replicaCount=4 . +``` + +You can also write your own "config.yaml" file with the values you +want to override and pass it to helm. + +``` +$ cat << EOF > config.yaml +namespace: MyCustomNamespace +image: + imageName: nvcr.io/nvidia/tritonserver:custom-tag + modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository +EOF +$ helm install example -f config.yaml . +``` + +## Using Triton Inference Server + +Now that the inference server is running you can send HTTP or GRPC +requests to it to perform inferencing. By default, the inferencing +service is exposed with a LoadBalancer service type. Use the following +to find the external IP for the inference server. In this case it is +34.83.9.133. + +``` +$ kubectl get services +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +... +example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m +``` + +The inference server exposes an HTTP endpoint on port 8000, and GRPC +endpoint on port 8001 and a Prometheus metrics endpoint on +port 8002. You can use curl to get the meta-data of the inference server +from the HTTP endpoint. + +``` +$ curl 34.83.9.133:8000/v2 +``` + +Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example +image classification client that can be used to perform inferencing +using image classification models being served by the inference +server. For example, + +``` +$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg +Request 0, batch size 1 +Image 'images/mug.jpg': + 504 (COFFEE MUG) = 0.723992 + 968 (CUP) = 0.270953 + 967 (ESPRESSO) = 0.00115997 +``` + +## Cleanup + +Once you've finished using the inference server you should use helm to +delete the deployment. + +``` +$ helm list +NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE +example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default +example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default + +$ helm uninstall example +$ helm uninstall example-metrics +``` + +For the Prometheus and Grafana services, you should [explicitly delete +CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart): + +``` +$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com +``` + +You may also want to delete the OCI bucket you created to hold the +model repository. 
+ +``` +$ oci os bucket delete --bucket-name triton-inference-server-repository --empty +``` diff --git a/deploy/oci/dashboard.json b/deploy/oci/dashboard.json new file mode 100644 index 0000000000..8960b41d35 --- /dev/null +++ b/deploy/oci/dashboard.json @@ -0,0 +1,411 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.3.5" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nv_inference_request_success", + "legendFormat": "Success {{instance}}", + "refId": "A" + }, + { + "expr": "nv_inference_request_failure", + "legendFormat": "Failure {{instance}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cumulative Inference Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateReds", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 7, + "legend": { + "show": false + }, + "options": {}, + "reverseYBuckets": false, + "targets": [ + { + "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Load Ratio (Total Time / Compute Time)", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": 
"short", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Queue Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Compute Time (milliseconds)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Compute Time (ms)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 19, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Triton Inference Server", + "uid": "slEY4dsZk", + "version": 8 +} diff --git a/deploy/oci/templates/_helpers.tpl b/deploy/oci/templates/_helpers.tpl new file mode 100644 index 0000000000..6dba910012 --- /dev/null +++ 
b/deploy/oci/templates/_helpers.tpl @@ -0,0 +1,92 @@ +{{/* +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/}} + +{{/* vim: set filetype=mustache: */}} +{{/* +Create inference server name. +*/}} +{{- define "triton-inference-server.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "triton-inference-server.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* + Create inference server metrics service name and fullname derived from above and + truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics.name" -}} +{{- $basename := include "triton-inference-server.name" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics" -}} +{{- end -}} + +{{/* + Create inference server metrics monitor name and fullname derived from + above and truncated appropriately. +*/}} +{{- define "triton-inference-server-metrics-monitor.name" -}} +{{- $basename := include "triton-inference-server.name" . 
-}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{- define "triton-inference-server-metrics-monitor.fullname" -}} +{{- $basename := include "triton-inference-server.fullname" . -}} +{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}} +{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "triton-inference-server.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/deploy/oci/templates/deployment.yaml b/deploy/oci/templates/deployment.yaml new file mode 100644 index 0000000000..f374bd181f --- /dev/null +++ b/deploy/oci/templates/deployment.yaml @@ -0,0 +1,100 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} + template: + metadata: + labels: + app: {{ template "triton-inference-server.name" . 
}} + release: {{ .Release.Name }} + + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.imageName }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + + resources: + limits: + nvidia.com/gpu: {{ .Values.image.numGpus }} + + args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}", + "--model-control-mode=poll", + "--repository-poll-secs=5"] + + env: + - name: AWS_DEFAULT_REGION + valueFrom: + secretKeyRef: + name: oci-credentials + key: OCI_DEFAULT_REGION + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: oci-credentials + key: OCI_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: oci-credentials + key: OCI_SECRET_ACCESS_KEY + + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + livenessProbe: + httpGet: + path: /v2/health/live + port: http + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + httpGet: + path: /v2/health/ready + port: http + + securityContext: + runAsUser: 1000 + fsGroup: 1000 diff --git a/deploy/oci/templates/secrets.yaml b/deploy/oci/templates/secrets.yaml new file mode 100644 index 0000000000..0546fdda9d --- /dev/null +++ b/deploy/oci/templates/secrets.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Secret +metadata: + name: oci-credentials +type: Opaque +data: + OCI_DEFAULT_REGION: {{ .Values.secret.region }} + OCI_ACCESS_KEY_ID: {{ .Values.secret.id }} + OCI_SECRET_ACCESS_KEY: {{ .Values.secret.key }} diff --git a/deploy/oci/templates/service.yaml b/deploy/oci/templates/service.yaml new file mode 100644 index 0000000000..3315fd77db --- /dev/null +++ b/deploy/oci/templates/service.yaml @@ -0,0 +1,91 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 8000 + targetPort: http + name: http-inference-server + - port: 8001 + targetPort: grpc + name: grpc-inference-server + - port: 8002 + targetPort: metrics + name: metrics-inference-server + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: v1 +kind: Service +metadata: + name: {{ template "triton-inference-server-metrics.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + annotations: + alpha.monitoring.coreos.com/non-namespaced: "true" +spec: + ports: + - name: metrics + port: 8080 + targetPort: metrics + protocol: TCP + selector: + app: {{ template "triton-inference-server.name" . }} + release: {{ .Release.Name }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ template "triton-inference-server-metrics-monitor.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + app: {{ template "triton-inference-server-metrics-monitor.name" . }} + chart: {{ template "triton-inference-server.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + selector: + matchLabels: + app: {{ template "triton-inference-server-metrics.name" . }} + endpoints: + - port: metrics + interval: 15s diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml new file mode 100644 index 0000000000..55b8193ee2 --- /dev/null +++ b/deploy/oci/values.yaml @@ -0,0 +1,41 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +replicaCount: 1 + +image: + imageName: nvcr.io/nvidia/tritonserver:24.09-py3 + pullPolicy: IfNotPresent + modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository + numGpus: 1 + +service: + type: LoadBalancer + +secret: + region: OCI_REGION + id: OCI_SECRET_KEY_ID + key: OCI_SECRET_ACCESS_KEY diff --git a/docker/README.third-party-src b/docker/README.third-party-src new file mode 100644 index 0000000000..85f17d11ee --- /dev/null +++ b/docker/README.third-party-src @@ -0,0 +1,5 @@ +This directory contains the licenses and source code for software +included in the Triton Inference Server build. To extract the files +use: + + $ tar zxf src.tar.gz diff --git a/docker/cpu_only/entrypoint.d/12-banner.sh b/docker/cpu_only/entrypoint.d/12-banner.sh new file mode 100755 index 0000000000..0b4adda84b --- /dev/null +++ b/docker/cpu_only/entrypoint.d/12-banner.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +prodname_uc=$(echo "${NVIDIA_PRODUCT_NAME}" | tr [:lower:] [:upper:] | sed 's/ /_/g' | sed 's/^NVIDIA_//') # Product name +_prodver="NVIDIA_${prodname_uc}_VERSION" # Container product version variable name +_compver="${prodname_uc}_VERSION" # Upstream component version variable name + +echo +echo "NVIDIA Release ${!_prodver} (build ${NVIDIA_BUILD_ID})" +[ -n "${!_compver}" ] && echo "${NVIDIA_PRODUCT_NAME} Version ${!_compver}" diff --git a/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh b/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh new file mode 100755 index 0000000000..4caa8eeff7 --- /dev/null +++ b/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +export TRITON_SERVER_CPU_ONLY=1 diff --git a/docker/cpu_only/nvidia_entrypoint.sh b/docker/cpu_only/nvidia_entrypoint.sh new file mode 100755 index 0000000000..82859d1bb6 --- /dev/null +++ b/docker/cpu_only/nvidia_entrypoint.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Gather parts in alpha order +shopt -s nullglob extglob +SCRIPT_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" +declare -a PARTS=( "${SCRIPT_DIR}/entrypoint.d"/*@(.txt|.sh) ) +shopt -u nullglob extglob + +# Execute the entrypoint parts +for file in "${PARTS[@]}"; do + case "${file}" in + *.txt) cat "${file}";; + *.sh) source "${file}";; + esac +done + +echo + +# This script can either be a wrapper around arbitrary command lines, +# or it will simply exec bash if no arguments were given +if [[ $# -eq 0 ]]; then + exec "/bin/bash" +else + exec "$@" +fi diff --git a/docker/entrypoint.d/10-banner.txt b/docker/entrypoint.d/10-banner.txt new file mode 100644 index 0000000000..56a8b28e55 --- /dev/null +++ b/docker/entrypoint.d/10-banner.txt @@ -0,0 +1,4 @@ + +============================= +== Triton Inference Server == +============================= diff --git a/docker/entrypoint.d/15-container-copyright.txt b/docker/entrypoint.d/15-container-copyright.txt new file mode 100644 index 0000000000..f0aacf2464 --- /dev/null +++ b/docker/entrypoint.d/15-container-copyright.txt @@ -0,0 +1,2 @@ + +Copyright (c) 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. diff --git a/docker/entrypoint.d/50-gpu-driver-check2.sh b/docker/entrypoint.d/50-gpu-driver-check2.sh new file mode 100755 index 0000000000..bc22dd55ad --- /dev/null +++ b/docker/entrypoint.d/50-gpu-driver-check2.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +if [[ "${NVIDIA_CPU_ONLY:-0}" == "1" ]]; then + export TRITON_SERVER_CPU_ONLY=1 +fi diff --git a/docker/entrypoint.d/56-network-driver-version-check.sh b/docker/entrypoint.d/56-network-driver-version-check.sh new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docker/entrypoint.d/56-network-driver-version-check.sh @@ -0,0 +1 @@ + diff --git a/docker/entrypoint.d/70-shm-check.sh b/docker/entrypoint.d/70-shm-check.sh new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docker/entrypoint.d/70-shm-check.sh @@ -0,0 +1 @@ + diff --git a/docker/entrypoint.d/99-check-run-aip-mode.sh b/docker/entrypoint.d/99-check-run-aip-mode.sh new file mode 100755 index 0000000000..ec9249e944 --- /dev/null +++ b/docker/entrypoint.d/99-check-run-aip-mode.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +# If detect Vertex AI environment, launch tritonserver with supplied arguments + +# This has the effect of "unshifting" the tritonserver command onto the front +# of $@ if AIP_MODE is nonempty; it will then be exec'd by entrypoint.sh +set -- ${AIP_MODE:+"/opt/tritonserver/bin/tritonserver"} "$@" diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve new file mode 100755 index 0000000000..e9abc00bf5 --- /dev/null +++ b/docker/sagemaker/serve @@ -0,0 +1,169 @@ +#!/bin/bash +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/ + +# Use 'ready' for ping check in single-model endpoint mode, and use 'live' for ping check in multi-model endpoint model +# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26 +if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then + SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE} +else + SAGEMAKER_TRITON_PING_MODE="ready" +fi + +# Note: in Triton on SageMaker, each model url is registered as a separate repository +# e.g., /opt/ml/models/<hash>/model. 
Specifying MME model repo path as /opt/ml/models causes Triton +# to treat it as an additional empty repository and changes +# the state of all models to be UNAVAILABLE in the model repository +# https://github.com/triton-inference-server/core/blob/main/src/model_repository_manager.cc#L914,L922 +# On Triton, this path will be a dummy path as it's mandatory to specify a model repo when starting triton +SAGEMAKER_MULTI_MODEL_REPO=/tmp/sagemaker + +SAGEMAKER_MODEL_REPO=${SAGEMAKER_SINGLE_MODEL_REPO} +is_mme_mode=false + +if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then + if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then + mkdir -p ${SAGEMAKER_MULTI_MODEL_REPO} + SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO} + if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then + SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE} + else + SAGEMAKER_TRITON_PING_MODE="live" + fi + is_mme_mode=true + echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" + fi +fi + +SAGEMAKER_ARGS="--model-repository=${SAGEMAKER_MODEL_REPO}" +#Set model namespacing to true, but allow disabling if required +if [ -n "$SAGEMAKER_TRITON_DISABLE_MODEL_NAMESPACING" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-namespacing=${SAGEMAKER_TRITON_DISABLE_MODEL_NAMESPACING}" +else + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-namespacing=true" +fi +if [ -n "$SAGEMAKER_BIND_TO_PORT" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-port=${SAGEMAKER_BIND_TO_PORT}" +fi +if [ -n "$SAGEMAKER_SAFE_PORT_RANGE" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-safe-port-range=${SAGEMAKER_SAFE_PORT_RANGE}" +fi +if [ -n "$SAGEMAKER_TRITON_ALLOW_GRPC" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=${SAGEMAKER_TRITON_ALLOW_GRPC}" +else + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=false" +fi +if [ -n "$SAGEMAKER_TRITON_ALLOW_METRICS" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=${SAGEMAKER_TRITON_ALLOW_METRICS}" +else + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=false" +fi +if [ -n "$SAGEMAKER_TRITON_METRICS_PORT" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --metrics-port=${SAGEMAKER_TRITON_METRICS_PORT}" +fi +if [ -n "$SAGEMAKER_TRITON_GRPC_PORT" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --grpc-port=${SAGEMAKER_TRITON_GRPC_PORT}" +fi +if [ -n "$SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --buffer-manager-thread-count=${SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT}" +fi +if [ -n "$SAGEMAKER_TRITON_THREAD_COUNT" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-thread-count=${SAGEMAKER_TRITON_THREAD_COUNT}" +fi +# Enable verbose logging by default. 
If env variable is specified, use value from env variable +if [ -n "$SAGEMAKER_TRITON_LOG_VERBOSE" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-verbose=${SAGEMAKER_TRITON_LOG_VERBOSE}" +else + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-verbose=true" +fi +if [ -n "$SAGEMAKER_TRITON_LOG_INFO" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-info=${SAGEMAKER_TRITON_LOG_INFO}" +fi +if [ -n "$SAGEMAKER_TRITON_LOG_WARNING" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-warning=${SAGEMAKER_TRITON_LOG_WARNING}" +fi +if [ -n "$SAGEMAKER_TRITON_LOG_ERROR" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-error=${SAGEMAKER_TRITON_LOG_ERROR}" +fi +if [ -n "$SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-default-byte-size=${SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE}" +else + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-default-byte-size=16777216" #16MB +fi +if [ -n "$SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-growth-byte-size=${SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE}" +else + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-growth-byte-size=1048576" #1MB +fi +if [ -n "$SAGEMAKER_TRITON_TENSORFLOW_VERSION" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=tensorflow,version=${SAGEMAKER_TRITON_TENSORFLOW_VERSION}" +fi +if [ -n "$SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT" ]; then + num_gpus=$(nvidia-smi -L | wc -l) + for ((i=0; i<${num_gpus}; i++)); do + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-load-gpu-limit ${i}:${SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT}" + done +fi +if [ -n "$SAGEMAKER_TRITON_ADDITIONAL_ARGS" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} ${SAGEMAKER_TRITON_ADDITIONAL_ARGS}" +fi + + +if [ "${is_mme_mode}" = false ] && [ -f "${SAGEMAKER_MODEL_REPO}/config.pbtxt" ]; then + echo "ERROR: Incorrect directory structure." + echo " Model directory needs to contain the top level folder" + exit 1 +fi + +if [ "${is_mme_mode}" = false ] && [ -n "$SAGEMAKER_TRITON_DEFAULT_MODEL_NAME" ]; then + if [ -d "${SAGEMAKER_MODEL_REPO}/$SAGEMAKER_TRITON_DEFAULT_MODEL_NAME" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}" + else + echo "ERROR: Directory with provided SAGEMAKER_TRITON_DEFAULT_MODEL_NAME ${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME} does not exist" + exit 1 + fi +elif [ "${is_mme_mode}" = false ]; then + MODEL_DIRS=(`find "${SAGEMAKER_MODEL_REPO}" -mindepth 1 -maxdepth 1 -type d -printf "%f\n"`) + case ${#MODEL_DIRS[@]} in + 0) echo "ERROR: No model found in model repository"; + exit 1 + ;; + 1) echo "WARNING: No SAGEMAKER_TRITON_DEFAULT_MODEL_NAME provided." + echo " Starting with the only existing model directory ${MODEL_DIRS[0]}"; + export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME=${MODEL_DIRS[0]} + ;; + *) echo "ERROR: More than 1 model directory found in model repository." + echo " Either provide a single directory or set SAGEMAKER_TRITON_DEFAULT_MODEL_NAME to run the ensemble backend." + echo " Directories found in model repository: ${MODEL_DIRS[@]}"; + exit 1 + ;; + esac + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}" +fi + +tritonserver --allow-sagemaker=true --allow-http=false --model-control-mode=explicit $SAGEMAKER_ARGS diff --git a/docs/Dockerfile.docs b/docs/Dockerfile.docs new file mode 100644 index 0000000000..ba30a144ac --- /dev/null +++ b/docs/Dockerfile.docs @@ -0,0 +1,78 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FROM ubuntu:22.04 + +# various documentation dependencies +RUN apt-get update -q=2 \ + && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + doxygen \ + git \ + git-lfs \ + pandoc \ + python3-dev \ + python3-pip \ + ssh \ + unzip \ + wget \ + && rm -rf /var/lib/apt/lists/* + +# install protobuf +RUN wget https://github.com/google/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip -O /tmp/proto.zip \ + && unzip /tmp/proto.zip -d /usr/local \ + && rm /tmp/proto.zip + +# install pseudomuto/protoc-gen-doc +RUN wget https://github.com/pseudomuto/protoc-gen-doc/releases/download/v1.3.2/protoc-gen-doc-1.3.2.linux-amd64.go1.12.6.tar.gz -O /tmp/protoc-gen-doc.tar.gz \ + && tar -xvf /tmp/protoc-gen-doc.tar.gz --strip-components=1 -C /usr/local/bin/ \ + && rm /tmp/protoc-gen-doc.tar.gz + +# install sphinx et al +RUN pip3 install \ + ablog \ + attrs \ + breathe \ + docutils \ + exhale \ + ipython \ + myst-nb \ + nbclient \ + nbsphinx \ + rst-to-myst \ + sphinx==5.0.0 \ + sphinx-book-theme \ + sphinx-copybutton \ + sphinx-design \ + sphinx-prompt \ + sphinx-sitemap \ + sphinx-tabs \ + sphinxcontrib-bibtex + +# Set visitor script to be included on every HTML page +ENV VISITS_COUNTING_SCRIPT="//assets.adobedtm.com/b92787824f2e0e9b68dc2e993f9bd995339fe417/satelliteLib-7ba51e58dc61bcb0e9311aadd02a0108ab24cc6c.js" + diff --git a/docs/Makefile b/docs/Makefile index fb11718781..b8cf4b654b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,34 +24,35 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# Makefile for Sphinx documentation +# Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SPHINXPROJ = TRTIS -SOURCEDIR = . -BUILDDIR = build -EXHALEDIRS = cpp_api doxyoutput -PROTOBUFFILES = $(wildcard ../src/core/*.proto) +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = build +TRITONCLIENTRSTDIR = _reference/tritonclient + +#PROTOBUFFILES = $(wildcard ../triton/proto/*.proto) # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) clean: - @rm -fr $(BUILDDIR) $(EXHALEDIRS) - @rm -f protobuf_api/*.proto.rst + @rm -fr ${BUILDDIR} + @rm -fr ${TRITONCLIENTRSTDIR} + +.PHONY: help Makefile clean -protobufdoc: protobuf_api/gen_proto_doc.sh - cd protobuf_api && \ - rm -f *.proto.rst && \ - bash -x ./gen_proto_doc.sh $(PROTOBUFFILES:%=../%) +# protobuf: source/reference/protos/gen_proto_doc.sh +# cd source/reference/protos && \ +# rm -f *.proto.rst && \ +# bash -x ./gen_proto_doc.sh $(PROTOBUFFILES:%=../%) # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile protobufdoc +%: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help clean protobufdoc Makefile diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000..0f9faba3fe --- /dev/null +++ b/docs/README.md @@ -0,0 +1,218 @@ +<!-- +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+-->
+
+# **Triton Inference Server Documentation**
+
+| [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) | [API Guide](protocol/README.md) | [Additional Resources](README.md#resources) | [Customization Guide](README.md#customization-guide) |
+| ------------ | --------------- | --------------- | ------------ | --------------- | --------------- |
+
+**New to Triton Inference Server?** Make use of
+[these tutorials](https://github.com/triton-inference-server/tutorials)
+to begin your Triton journey!
+
+## **Installation**
+Before you can use the Triton Docker image you must install
+[Docker](https://docs.docker.com/engine/install). If you plan on using
+a GPU for inference you must also install the [NVIDIA Container
+Toolkit](https://github.com/NVIDIA/nvidia-docker). DGX users should
+follow [Preparing to use NVIDIA
+Containers](http://docs.nvidia.com/deeplearning/dgx/preparing-containers/index.html).
+
+Pull the image using the following command.
+
+```
+$ docker pull nvcr.io/nvidia/tritonserver:<yy.mm>-py3
+```
+
+Where \<yy.mm\> is the version of Triton that you want to pull. For a complete list of all the variants and versions of the Triton Inference Server Container, visit the [NGC Page](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver). More information about customizing the Triton Container can be found in [this section](customization_guide/compose.md) of the User Guide.
+
+## **Getting Started**
+
+This guide covers the simplest possible workflow for deploying a model using Triton Inference Server.
+- [Create a Model Repository](getting_started/quickstart.md#create-a-model-repository)
+- [Launch Triton](getting_started/quickstart.md#launch-triton)
+- [Send an Inference Request](getting_started/quickstart.md#send-an-inference-request)
+
+Triton Inference Server has a considerable list of versatile and powerful features. New users are encouraged to explore the [User Guide](README.md#user-guide) and the [additional resources](README.md#resources) sections for the features most relevant to their use case.
+
+## **User Guide**
+The User Guide describes how to configure Triton, organize and configure your models, use the C++ and Python clients, and more.
+ +## **User Guide** +The User Guide describes how to configure Triton, organize and configure your models, use the C++ and Python clients, and more. This guide includes the following: +* Creating a Model Repository [[Overview](README.md#model-repository) || [Details](user_guide/model_repository.md)] +* Writing a Model Configuration [[Overview](README.md#model-configuration) || [Details](user_guide/model_configuration.md)] +* Building a Model Pipeline [[Overview](README.md#model-pipeline)] +* Managing Model Availability [[Overview](README.md#model-management) || [Details](user_guide/model_management.md)] +* Collecting Server Metrics [[Overview](README.md#metrics) || [Details](user_guide/metrics.md)] +* Supporting Custom Ops/layers [[Overview](README.md#framework-custom-operations) || [Details](user_guide/custom_operations.md)] +* Using the Client API [[Overview](README.md#client-libraries-and-examples) || [Details](https://github.com/triton-inference-server/client)] +* Cancelling Inference Requests [[Overview](README.md#cancelling-inference-requests) || [Details](user_guide/request_cancellation.md)] +* Analyzing Performance [[Overview](README.md#performance-analysis)] +* Deploying on edge (Jetson) [[Overview](README.md#jetson-and-jetpack)] +* Debugging Guide [Details](./user_guide/debugging_guide.md)
+ +### Model Repository +[Model Repositories](user_guide/model_repository.md) are the organizational hub for using Triton. All models, configuration files, and additional resources needed to serve the models are housed inside a model repository. +- [Cloud Storage](user_guide/model_repository.md#model-repository-locations) +- [File Organization](user_guide/model_repository.md#model-files) +- [Model Versioning](user_guide/model_repository.md#model-versions)
+### Model Configuration + +A [Model Configuration](user_guide/model_configuration.md) file is where you set model-level options, such as output tensor reshaping and dynamic batch sizing.
+ +#### Required Model Configuration + +Triton Inference Server requires a few [minimum required parameters](user_guide/model_configuration.md#minimal-model-configuration) to be filled in the Model Configuration. These required parameters describe the structure of the model. For TensorFlow, ONNX and TensorRT models, users can rely on Triton to [Auto Generate](user_guide/model_configuration.md#auto-generated-model-configuration) the minimum required model configuration. +- [Maximum Batch Size - Batching and Non-Batching Models](user_guide/model_configuration.md#maximum-batch-size) +- [Input and Output Tensors](user_guide/model_configuration.md#inputs-and-outputs) + - [Tensor Datatypes](user_guide/model_configuration.md#datatypes) + - [Tensor Reshape](user_guide/model_configuration.md#reshape) + - [Shape Tensor](user_guide/model_configuration.md#shape-tensors)
+ +#### Versioning Models +Users need the ability to save and serve different versions of models based on business requirements. Triton allows users to set policies that make different versions of a model available as needed. [Learn More](user_guide/model_configuration.md#version-policy).
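A minimal, hypothetical `config.pbtxt` that combines the required structural fields described above with a version policy and the instance-group settings discussed next might look like the following sketch; the model name, platform, tensor names, and dims are placeholders.

```
name: "my_model"
platform: "onnxruntime_onnx"
max_batch_size: 8

input [
  {
    name: "INPUT0"
    data_type: TYPE_FP32
    dims: [ 16 ]
  }
]
output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ 16 ]
  }
]

# Serve only the two most recent versions found in the repository
version_policy: { latest { num_versions: 2 }}

# Run two copies of the model on GPU 0
instance_group [
  {
    count: 2
    kind: KIND_GPU
    gpus: [ 0 ]
  }
]
```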
+ +#### Instance Groups +Triton allows users to run multiple instances of the same model. Users can specify how many instances (copies) of a model to load and whether to use GPU or CPU. If the model is being loaded on GPU, users can also select which GPUs to use. [Learn more](user_guide/model_configuration.md#instance-groups). +- [Specifying Multiple Model Instances](user_guide/model_configuration.md#multiple-model-instances) +- [CPU and GPU Instances](user_guide/model_configuration.md#cpu-model-instance) +- [Configuring Rate Limiter](user_guide/model_configuration.md#rate-limiter-configuration)
+ +#### Optimization Settings + +The Model Configuration ModelOptimizationPolicy property is used to specify optimization and prioritization settings for a model. These settings control whether and how a model is optimized by the backend and how it is scheduled and executed by Triton. See the [ModelConfig Protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) and [Optimization Documentation](user_guide/optimization.md#optimization) for the currently available settings. +- [Framework-Specific Optimization](user_guide/optimization.md#framework-specific-optimization) + - [ONNX-TensorRT](user_guide/optimization.md#onnx-with-tensorrt-optimization-ort-trt) + - [ONNX-OpenVINO](user_guide/optimization.md#onnx-with-openvino-optimization) + - [TensorFlow-TensorRT](user_guide/optimization.md#tensorflow-with-tensorrt-optimization-tf-trt) + - [TensorFlow-Mixed-Precision](user_guide/optimization.md#tensorflow-automatic-fp16-optimization) +- [NUMA Optimization](user_guide/optimization.md#numa-optimization)
+ +#### Scheduling and Batching + +Triton supports batching individual inference requests together to improve compute resource utilization. This is important because individual requests typically do not saturate the GPU, leaving much of its parallelism unused. Learn more about Triton's [Batcher and Scheduler](user_guide/model_configuration.md#scheduling-and-batching). A minimal dynamic batching configuration sketch follows the list below. +- [Default Scheduler - Non-Batching](user_guide/model_configuration.md#default-scheduler) +- [Dynamic Batcher](user_guide/model_configuration.md#dynamic-batcher) + - [How to Configure Dynamic Batcher](user_guide/model_configuration.md#recommended-configuration-process) + - [Delayed Batching](user_guide/model_configuration.md#delayed-batching) + - [Preferred Batch Size](user_guide/model_configuration.md#preferred-batch-sizes) + - [Preserving Request Ordering](user_guide/model_configuration.md#preserve-ordering) + - [Priority Levels](user_guide/model_configuration.md#priority-levels) + - [Queuing Policies](user_guide/model_configuration.md#queue-policy) + - [Ragged Batching](user_guide/ragged_batching.md) +- [Sequence Batcher](user_guide/model_configuration.md#sequence-batcher) + - [Stateful Models](user_guide/architecture.md#stateful-models) + - [Control Inputs](user_guide/architecture.md#control-inputs) + - [Implicit State - Stateful Inference Using a Stateless Model](user_guide/architecture.md#implicit-state-management) + - [Sequence Scheduling Strategies](user_guide/architecture.md#scheduling-strategies) + - [Direct](user_guide/architecture.md#direct) + - [Oldest](user_guide/architecture.md#oldest)
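As noted above, enabling the dynamic batcher is a small addition to `config.pbtxt`. The values below are illustrative rather than recommendations; see [How to Configure Dynamic Batcher](user_guide/model_configuration.md#recommended-configuration-process) for guidance on choosing them.

```
# Batch pending requests together, preferring batches of 4 or 8, and
# wait up to 100 microseconds for a preferred batch to form
dynamic_batching {
  preferred_batch_size: [ 4, 8 ]
  max_queue_delay_microseconds: 100
}
```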
+ +#### Rate Limiter +The rate limiter manages the rate at which requests are scheduled on model instances by Triton. The rate limiter operates across all models loaded in Triton to allow cross-model prioritization. [Learn more](user_guide/rate_limiter.md).
+ +#### Model Warmup +For some backends (see [Additional Resources](README.md#resources)), some or all initialization is deferred until the first inference request is received. This conserves resources, but it also means the initial requests are processed more slowly than expected. Users can pre-"warm up" the model by instructing Triton to initialize it ahead of time. [Learn more](user_guide/model_configuration.md#model-warmup).
+ +#### Inference Request/Response Cache +Triton can cache inference responses. [Learn More](user_guide/response_cache.md).
+ +### Model Pipeline +Building ensembles is as easy as adding an additional configuration file that outlines the flow of tensors from one model to another. Any additional changes required by the model ensemble can be made in the existing (individual) model configurations. +- [Model Ensemble](user_guide/architecture.md#ensemble-models) +- [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
+### Model Management +Users can specify policies in the model configuration for loading and unloading of models. This [section](user_guide/model_management.md) covers the user-selectable policy details. +- [Explicit Model Loading and Unloading](user_guide/model_management.md#model-control-mode-explicit) +- [Modifying the Model Repository](user_guide/model_management.md#modifying-the-model-repository)
+### Metrics +Triton provides Prometheus metrics such as GPU utilization, memory usage, latency, and more. Learn about the [available metrics](user_guide/metrics.md).
+### Framework Custom Operations +Some frameworks provide the option of building custom layers/operations. These can be added to the specific Triton Backends for those frameworks. [Learn more](user_guide/custom_operations.md) +- [TensorRT](user_guide/custom_operations.md#tensorrt) +- [TensorFlow](user_guide/custom_operations.md#tensorflow) +- [PyTorch](user_guide/custom_operations.md#pytorch) +- [ONNX](user_guide/custom_operations.md#onnx)
+### Client Libraries and Examples +Use the [Triton Client](https://github.com/triton-inference-server/client) API to integrate client applications with Triton over the network via HTTP/gRPC, or integrate applications directly with Triton using CUDA shared memory to remove network overhead. +- [C++ HTTP/GRPC Libraries](https://github.com/triton-inference-server/client#client-library-apis) +- [Python HTTP/GRPC Libraries](https://github.com/triton-inference-server/client#client-library-apis) +- [Java HTTP Library](https://github.com/triton-inference-server/client/tree/main/src/java) +- GRPC Generated Libraries + - [go](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/go) + - [Java/Scala](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/java) + - [Javascript](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/javascript) +- [Shared Memory Extension](protocol/extension_shared_memory.md)
+### Cancelling Inference Requests +Triton can detect and handle requests that have been cancelled from the client side. This [document](user_guide/request_cancellation.md) discusses the scope and limitations of the feature.
+### Performance Analysis +Understanding inference performance is key to better resource utilization. Use Triton's tools to customize your deployment; a sample Perf Analyzer invocation follows the list below. +- [Performance Tuning Guide](user_guide/performance_tuning.md) +- [Optimization](user_guide/optimization.md) +- [Model Analyzer](user_guide/model_analyzer.md) +- [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) +- [Inference Request Tracing](user_guide/trace.md)
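As referenced above, a common starting point is to let Perf Analyzer sweep request concurrency against an already-deployed model. The sketch below assumes a locally running server and uses `my_model` as a placeholder model name; see the Perf Analyzer README for the full option set.

```
# Measure latency/throughput while sweeping request concurrency from 1 to 4
$ perf_analyzer -m my_model --concurrency-range 1:4

# Same sweep against the gRPC endpoint instead of the default HTTP endpoint
$ perf_analyzer -m my_model -i grpc -u localhost:8001 --concurrency-range 1:4
```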
### Jetson and JetPack +Triton can be deployed on edge devices. Explore [resources](user_guide/jetson.md) and [examples](examples/jetson/README.md).
+ +## **Resources** + +The following resources are recommended for exploring the full suite of Triton Inference Server's functionality. +- **Clients**: Triton Inference Server comes with C++, Python and Java APIs with which users can send HTTP/REST or gRPC requests (with possible extensions for other languages). Explore the [client repository](https://github.com/triton-inference-server/server/tree/main/docs/protocol) for examples and documentation.
+ +- **Configuring Deployment**: Triton comes with three tools which can be used to configure deployment settings, measure performance and recommend optimizations. + - [Model Analyzer](https://github.com/triton-inference-server/model_analyzer): Model Analyzer is a CLI tool built to recommend deployment configurations for Triton Inference Server based on the user's Quality of Service requirements. It also generates detailed reports about model performance to summarize the benefits and trade-offs of different configurations. + - [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md): Perf Analyzer is a CLI application built to generate inference requests and measure the latency of those requests and the throughput of the model being served. + - [Model Navigator](https://github.com/triton-inference-server/model_navigator): The Triton Model Navigator is a tool that automates the process of moving a model from source to the optimal format and configuration for deployment on Triton Inference Server. The tool supports exporting models from source to all possible formats and applies the Triton Inference Server backend optimizations.
+ +- **Backends**: Triton supports a wide variety of frameworks used to run models. Users can extend this functionality by creating custom backends. + - [PyTorch](https://github.com/triton-inference-server/pytorch_backend): Widely used Open Source DL Framework + - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend): Widely used Open Source DL Framework + - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend): NVIDIA [TensorRT](https://developer.nvidia.com/tensorrt) is an inference acceleration SDK that provides a wide range of graph optimizations, kernel optimizations, use of lower precision, and more. + - [ONNX](https://github.com/triton-inference-server/onnxruntime_backend): ONNX Runtime is a cross-platform inference and training machine-learning accelerator. + - [OpenVINO](https://github.com/triton-inference-server/openvino_backend): OpenVINO™ is an open-source toolkit for optimizing and deploying AI inference. + - [Paddle Paddle](https://github.com/triton-inference-server/paddlepaddle_backend): Widely used Open Source DL Framework + - [Python](https://github.com/triton-inference-server/python_backend): Users can add custom business logic, or any Python code/model, for serving requests. + - [Forest Inference Library](https://github.com/triton-inference-server/fil_backend): Backend built for forest models trained by several popular machine learning frameworks (including XGBoost, LightGBM, Scikit-Learn, and cuML) + - [DALI](https://github.com/triton-inference-server/dali_backend): NVIDIA [DALI](https://developer.nvidia.com/dali) is a Data Loading Library purpose-built to accelerate pre-processing and data loading steps in a Deep Learning Pipeline.
+ - [HugeCTR](https://github.com/triton-inference-server/hugectr_backend): HugeCTR is a GPU-accelerated recommender framework designed to distribute training across multiple GPUs and nodes and estimate Click-Through Rates + - [Managed Stateful Models](https://github.com/triton-inference-server/stateful_backend): This backend automatically manages the input and output states of a model. The states are associated with a sequence id and need to be tracked for inference requests associated with the sequence id. + - [Faster Transformer](https://github.com/triton-inference-server/fastertransformer_backend): NVIDIA [FasterTransformer](https://github.com/NVIDIA/FasterTransformer/) (FT) is a library implementing an accelerated engine for the inference of transformer-based neural networks, with a special emphasis on large models, spanning many GPUs and nodes in a distributed manner. + - [Building Custom Backends](https://github.com/triton-inference-server/backend/tree/main/examples#tutorial) + - [Sample Custom Backend: Repeat_backend](https://github.com/triton-inference-server/repeat_backend): Backend built to demonstrate sending of zero, one, or multiple responses per request. + +## **Customization Guide** +This guide describes how to build and test Triton and also how Triton can be extended with new functionality. + +- [Build](customization_guide/build.md) +- [Protocols and APIs](customization_guide/inference_protocols.md). +- [Backends](https://github.com/triton-inference-server/backend) +- [Repository Agents](customization_guide/repository_agents.md) +- [Test](customization_guide/test.md) diff --git a/docs/_reference/tritonclient_api.rst b/docs/_reference/tritonclient_api.rst new file mode 100644 index 0000000000..5dcfdee7d4 --- /dev/null +++ b/docs/_reference/tritonclient_api.rst @@ -0,0 +1,37 @@ +.. + # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions + # are met: + # * Redistributions of source code must retain the above copyright + # notice, this list of conditions and the following disclaimer. + # * Redistributions in binary form must reproduce the above copyright + # notice, this list of conditions and the following disclaimer in the + # documentation and/or other materials provided with the distribution. + # * Neither the name of NVIDIA CORPORATION nor the names of its + # contributors may be used to endorse or promote products derived + # from this software without specific prior written permission. + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Python tritonclient Package API +=============================== + +The tritonclient Python package is hosted on `PyPI <https://pypi.org/project/tritonclient/>`_. This package documentation for tritonclient is generated by the Sphinx autosummary extension. + +.. autosummary:: + :toctree: tritonclient + :recursive: + + tritonclient diff --git a/docs/_static/.gitattributes b/docs/_static/.gitattributes new file mode 100644 index 0000000000..04865f126a --- /dev/null +++ b/docs/_static/.gitattributes @@ -0,0 +1,2 @@ +nvidia-logo-horiz-rgb-blk-for-screen.png filter=lfs diff=lfs merge=lfs -text +nvidia-logo-vert-rgb-blk-for-screen.png filter=lfs diff=lfs merge=lfs -text diff --git a/docs/_static/NVIDIA-LogoBlack.svg b/docs/_static/NVIDIA-LogoBlack.svg new file mode 100644 index 0000000000..c612396c71 --- /dev/null +++ b/docs/_static/NVIDIA-LogoBlack.svg @@ -0,0 +1 @@ +<svg id="NVIDIA_Logo_V" data-name="NVIDIA Logo V" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1211.808 415.949"><defs><style>.cls-1{fill:none;}</style></defs><title>NVIDIA-LogoBlack \ No newline at end of file diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 0000000000..46bab57d4e --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1,319 @@ +/* +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/ +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/5/2/52891dda673228d54e5d57bf1e4a3880d4b22405.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/0/e090b7dda7a582522c7f9045c6ce949cce60134f.woff) format("woff"); + font-weight: 300; + font-style: normal; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/a/1/a107baabcbf6b241099122336bce7429bcfd377a.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/3/a/3a6060a4e3bce70e5552ba0de8af4b22c6cf9144.woff) format("woff"); + font-weight: 300; + font-style: italic; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/9/9/9920d2b172b01d92fc9c1c0e521dcf45b59c47c3.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/6/c/6c7d947928a7e4ef3e80ed409bef6c243f2148cb.woff) format("woff"); + font-weight: 400; + font-style: normal; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/8/e8e63fe1244372cd942d957f44a5616a1eba0644.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/0/f/0f1fb2af0283ab09d36e7097bb07d895c3228f12.woff) format("woff"); + font-weight: 400; + font-style: italic; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/7/9/79d3c513a9cd72c59f65354f39f89ca52dc17dd2.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/2/5/2581ac533f5d01f4985d8a7245b0766b4630ced8.woff) format("woff"); + font-weight: 500; + font-style: normal; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/3/9/39d9ef1ee9770dd503f19bb2ace2fdb4eff3bb50.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/7/b/7bb5d5e2e71b2e13c8098b2e67c0a0ed9258e6c7.woff) format("woff"); + font-weight: 500; + font-style: italic; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/0/5/05276a55a43eb3f74981ec1e93252727afcd9d16.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/9/c/9cfec7ed941b06564aa4d5ca14610e81542d070f.woff) format("woff"); + font-weight: 700; + font-style: normal; +} +@font-face { + font-family: "NVIDIA Sans"; + src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/a/e/aebd14d09ba56f541e1b8735fb051e33710f9ae7.woff2) format("woff2"), + url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/d/edbdabef43acc5c12e84a94baaa5542c9404cfeb.woff) format("woff"); + font-weight: 700; + font-style: italic; +} + +/* Custom Styles */ +:root { +--pst-font-size-base: none; +--pst-color-primary: 0, 133, 197; +--pst-color-admonition-note: var(--pst-color-primary); +--pst-color-admonition-default: var(--pst-color-primary); +--pst-color-info: 255, 193, 7; +--pst-color-admonition-tip: var(--pst-color-info); +--pst-color-admonition-hint: var(--pst-color-info); +--pst-color-admonition-important: var(--pst-color-info); +--pst-color-warning: 245, 162, 82; +--pst-color-danger: 230, 101, 129; +--pst-color-admonition-warning: var(--pst-color-danger); +--pst-color-link: 118, 185, 0; +--pst-color-inline-code: 92, 22, 130; +--font-family-sans-serif: NVIDIA Sans, Helvetica, Arial, Sans-serif; +--pst-font-family-base-system: NVIDIA Sans, Helvetica, Arial, Sans-serif; +font-family: NVIDIA Sans, Helvetica, Arial, Sans-serif; +} + 
+.prev-next-area { + font-size: small; +} + +.docutils caption { + caption-side: top; +} + +#site-navigation h1.site-logo { + font-size: 0.85em; +} + +/* colors +nv green 118,185,0 +black 0, 0, 0 +light gray 205, 205, 205 +medium gray 140, 140, 140 +dark gray 94, 94, 94 + +emerald 0, 133, 100 +emerald #008564 +amethyst 92, 22, 130 +amethyst #5C1682 +cpu blue 0, 133, 197 +cpu blue #0085C5 +garnet 137, 12, 88 +garnet 890C58 +fluorite 250, 194, 0 +fluorite FAC200 +*/ + +:root { + --nv-green: #76b900; + --nv-green-darken: #6ead00; + --emerald: #008564; + --emerald-darken: #017c5d; + --amethyst: #5d1682; + --amethyst-darken: #4c116b; + --cpu-blue: #0071c5; + --cpu-blue-darken: #0062ad; + --garnet: #890c58; + --garnet-darken: #7a0c4e; + --fluorite: #fac200; + --fluorite-darken: #e4b301; + --dark-gray: #5e5e5e; + --light-gray: #cdcdcd; + --medium-gray: #8c8c8c; + --medium-gray-darken: #8c8c8cde; + --primary: #76b900; + --secondary: #008564; + --success: #5d1682; + --info: #0071c5; + --warning: #fac200; + --danger: #890c58; +} + +/* Riva TBYB (ASR and TTS) Styling */ +.demo-box { + background-color: rgb(245,245,245); +} +a:link { text-decoration: none; } +.scrollable { + height: 125px; + overflow-y: auto; + font-size: 1.3rem; +} +.dot { + height: 8px; + width: 8px; + background-color: rgb(228, 77, 77); + border-radius: 50%; + display: inline-block; +} +.timer { + font-size: 80%; + text-transform: uppercase; + white-space: nowrap; +} +.form-select { + border-radius: 0%; + font-size: 80%; +} +.form-control { + border-radius: 0%; +} +.input-group-text { + border-radius: 0%; + font-size: 80%; + text-transform: uppercase; + background-color: rgb(245,245,245); +} +.card { + border-radius: 0%; +} +.speech-control { + border-top-width: 0px; +} +.btn { + border-radius: 0%; + font-size: 80%; + text-transform: uppercase; + white-space: nowrap; + min-width: 125px; +} +.btn-primary { + background-color: var(--nv-green); + border-color: var(--nv-green); +} +.btn-primary:hover { + background-color: var(--nv-green-darken); + border-color: var(--nv-green-darken); +} +.btn-primary:focus, .btn-primary.focus { + background-color: var(--nv-green-darken); + border-color: var(--nv-green-darken); + -webkit-box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5); + box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5); +} +.btn-primary.disabled, .btn-primary:disabled { + background-color: var(--nv-green); + border-color: var(--nv-green); +} +.btn-primary:not(:disabled):not(.disabled):active, .btn-primary:not(:disabled):not(.disabled).active, +.show > .btn-primary.dropdown-toggle { + background-color: var(--nv-green-darken); + border-color: var(--nv-green-darken); +} +.btn-primary:not(:disabled):not(.disabled):active:focus, .btn-primary:not(:disabled):not(.disabled).active:focus, +.show > .btn-primary.dropdown-toggle:focus { + -webkit-box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5); + box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5); +} +.btn-secondary { + background-color: var(--medium-gray); + border-color: var(--medium-gray); +} +.btn-secondary:hover { + background-color: var(--medium-gray-darken); + border-color: var(--medium-gray-darken); +} +.btn-secondary:focus, .btn-secondary.focus { + background-color: var(--medium-gray-darken); + border-color: var(--medium-gray-darken); + -webkit-box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5); + box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5); +} +.btn-secondary.disabled, .btn-secondary:disabled { + background-color: var(--medium-gray); + border-color: var(--medium-gray); +} 
+.btn-secondary:not(:disabled):not(.disabled):active, .btn-secondary:not(:disabled):not(.disabled).active, +.show > .btn-secondary.dropdown-toggle { + background-color: var(--medium-gray-darken); + border-color: var(--medium-gray-darken); +} +.btn-secondary:not(:disabled):not(.disabled):active:focus, .btn-secondary:not(:disabled):not(.disabled).active:focus, +.show > .btn-secondary.dropdown-toggle:focus { + -webkit-box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5); + box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5); +} +.btn-link { + color: var(--nv-green); + text-decoration-line: none; +} +.btn-link:hover { + color: var(--nv-green-darken); +} +.btn-link:focus, .btn-link.focus { + color: var(--nv-green-darken); + -webkit-box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5); + box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5); +} +.link-primary { + color: var(--nv-green); +} +.link-primary:hover { + color: var(--nv-green-darken); +} + +/* Riva ASR Styles */ +#riva-upload-label { + margin-top: 0.5rem; +} + +/* Riva TTS Styles */ +.tts-control { + justify-content: space-between; + align-items: center; +} + +.tts-control > p { + margin: unset; +} + +#riva-tts-field { + resize: none; + border: unset; + padding: 0; + height: 100%; + font-size: 1.0rem; +} + +#riva-terms-of-use p { + max-width: 620px; +} + +/* Media Queries */ +@media (max-width: 1024px) { + + /* Riva TTS and ASR */ + .scrollable { + height: 250px; + } +} + diff --git a/docs/_static/logo_2color_horizontal.svg b/docs/_static/logo_2color_horizontal.svg new file mode 100644 index 0000000000..56b0a254e5 --- /dev/null +++ b/docs/_static/logo_2color_horizontal.svg @@ -0,0 +1,2 @@ + + diff --git a/docs/_static/logo_2color_vertical.svg b/docs/_static/logo_2color_vertical.svg new file mode 100644 index 0000000000..69e64b7001 --- /dev/null +++ b/docs/_static/logo_2color_vertical.svg @@ -0,0 +1,2 @@ + + diff --git a/docs/_static/nvidia-logo-horiz-rgb-blk-for-screen.png b/docs/_static/nvidia-logo-horiz-rgb-blk-for-screen.png new file mode 100644 index 0000000000..6316a9340f --- /dev/null +++ b/docs/_static/nvidia-logo-horiz-rgb-blk-for-screen.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd57ffce985e08c97c6af5fdadd2a28e4a92996455edc2d0598dd964cca51eae +size 48928 diff --git a/docs/_static/nvidia-logo-vert-rgb-blk-for-screen.png b/docs/_static/nvidia-logo-vert-rgb-blk-for-screen.png new file mode 100644 index 0000000000..5546c1b57d --- /dev/null +++ b/docs/_static/nvidia-logo-vert-rgb-blk-for-screen.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17a25111e145aa52b77ec5a89eb3b0c7d9a2a90dea25a0bb867a937514fc783c +size 63541 diff --git a/docs/_static/rtd-data.js b/docs/_static/rtd-data.js new file mode 100644 index 0000000000..7ed13e8ee0 --- /dev/null +++ b/docs/_static/rtd-data.js @@ -0,0 +1,36 @@ +/* +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// Dummy data for testing ReadTheDocs footer insertion +// This mimics RTD data for a project that uses both versions + languages +var READTHEDOCS_DATA = { + project: "frc-docs", + version: "latest", + language: "en", + proxied_api_host: "https://readthedocs.org", +}; diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 0000000000..570aba8ba3 --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,31 @@ + +{% extends "!layout.html" %} +{%- block footer %} + +{%- endblock %} diff --git a/docs/architecture.rst b/docs/architecture.rst deleted file mode 100644 index 75da5b0574..0000000000 --- a/docs/architecture.rst +++ /dev/null @@ -1,118 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Architecture -============ - -The following figure shows the TensorRT Inference Server high-level -architecture. The :ref:`model repository ` -is a file-system based store of the models that TRTIS will make -available for inferencing. 
Inference requests arrive at the server via -either :ref:`HTTP or GRPC ` and are then -routed to the appropriate per-model scheduler queue. The scheduler -performs fair scheduling and dynamic batching for each model’s -requests. The schedule passes each request to the framework backend -corresponding to the model type. The framework backend performs -inferencing using the inputs provided in the request to produce the -requested outputs. The outputs are then formatted and a response is -sent. - -.. image:: images/arch.png - -.. _section-concurrent-model-execution: - -Concurrent Model Execution --------------------------- - -The TRTIS architecture allows multiple models and/or multiple -instances of the same model to execute in parallel on a single -GPU. The following figure shows an example with two models; model0 and -model1. Assuming TRTIS is not currently processing any request, when -two requests arrive simultaneously, one for each model, TRTIS -immediately schedules both of them onto the GPU and the GPU’s hardware -scheduler begins working on both computations in parallel. - -.. image:: images/multi_model_exec.png - -By default, if multiple requests for the same model arrive at the same -time, TRTIS will serialize their execution by scheduling only one at a -time on the GPU, as shown in the following figure. - -.. image:: images/multi_model_serial_exec.png - -The TensorRT inference server provides an :ref:`instance-group -` feature that allows each model to specify -how many parallel executions of that model should be allowed. Each -such enabled parallel execution is referred to as an *execution -instance*. By default, TRTIS gives each model a single execution -instance, which means that only a single execution of the model is -allowed to be in progress at a time as shown in the above figure. By -using instance-group the number of execution instances for a model can -be increased. The following figure shows model execution when model1 -is configured to allow three execution instances. As shown in the -figure, the first three model1 inference requests are immediately -executed in parallel on the GPU. The fourth model1 inference request -must wait until one of the first three executions completes before -beginning. - -.. image:: images/multi_model_parallel_exec.png - -To provide the current model execution capabilities shown in the above -figures, TRTIS uses `CUDA streams -`_ -to exploit the GPU’s hardware scheduling capabilities. CUDA streams -allow TRTIS to communicate independent sequences of memory-copy and -kernel executions to the GPU. The hardware scheduler in the GPU takes -advantage of the independent execution streams to fill the GPU with -independent memory-copy and kernel executions. For example, using -streams allows the GPU to execute a memory-copy for one model, a -kernel for another model, and a different kernel for yet another model -at the same time. - -The following figure shows some details of how this works within the -TensorRT Inference Server. Each framework backend (TensorRT, -TensorFlow, Caffe2) provides an API for creating an execution context -that is used to execute a given model (each framework uses different -terminology for this concept but here we refer to them generally as -execution contexts). Each framework allows an execution context to be -associated with a CUDA stream. This CUDA stream is used by the -framework to execute all memory copies and kernels needed for the -model associated with the execution context. 
For a given model, TRTIS -creates one execution context for each execution instance specified -for the model. When an inference request arrives for a given model, -that request is queued in the model scheduler associated with that -model. The model scheduler waits for any execution context associated -with that model to be idle and then sends the queued request to the -context. The execution context then issues all the memory copies and -kernel executions required to execute the model to the CUDA stream -associated with that execution context. The memory copies and kernels -in each CUDA stream are independent of memory copies and kernels in -other CUDA streams. The GPU hardware scheduler looks across all CUDA -streams to find independent memory copies and kernels to execute on -the GPU. - -.. image:: images/cuda_stream_exec.png diff --git a/docs/build.rst b/docs/build.rst deleted file mode 100644 index 9e9d7f1ce5..0000000000 --- a/docs/build.rst +++ /dev/null @@ -1,121 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Building -======== - -The TensorRT Inference Server is built using Docker and the TensorFlow -and PyTorch containers from `NVIDIA GPU Cloud (NGC) -`_. Before building you must install Docker -and nvidia-docker and login to the NGC registry by following the -instructions in :ref:`section-installing-prebuilt-containers`. - -.. _section-building-the-server: - -Building the Server -------------------- - -To build a release version of the TRTIS container, change directory to -the root of the repo and issue the following command:: - - $ docker build --pull -t tensorrtserver . - -Incremental Builds -^^^^^^^^^^^^^^^^^^ - -For typical development you will want to run the *build* container -with your local repo’s source files mounted so that your local changes -can be incrementally built. This is done by first building the -*tensorrtserver_build* container:: - - $ docker build --pull -t tensorrtserver_build --target trtserver_build . 
- -By mounting /path/to/tensorrtserver/src into the container at -/workspace/src, changes to your local repo will be reflected in the -container:: - - $ nvidia-docker run -it --rm -v/path/to/tensorrtserver/src:/workspace/src tensorrtserver_build - -Within the container you can perform an incremental server build -with:: - - # cd /workspace - # bazel build -c opt --config=cuda src/servers/trtserver - # cp /workspace/bazel-bin/src/servers/trtserver /opt/tensorrtserver/bin/trtserver - -Similarly, within the container you can perform an incremental build -of the C++ and Python client libraries and example executables with:: - - # cd /workspace - # bazel build -c opt --config=cuda src/clients/… - # mkdir -p /opt/tensorrtserver/bin - # cp bazel-bin/src/clients/c++/image_client /opt/tensorrtserver/bin/. - # cp bazel-bin/src/clients/c++/perf_client /opt/tensorrtserver/bin/. - # cp bazel-bin/src/clients/c++/simple_client /opt/tensorrtserver/bin/. - # mkdir -p /opt/tensorrtserver/lib - # cp bazel-bin/src/clients/c++/librequest.so /opt/tensorrtserver/lib/. - # cp bazel-bin/src/clients/c++/librequest.a /opt/tensorrtserver/lib/. - # mkdir -p /opt/tensorrtserver/pip - # bazel-bin/src/clients/python/build_pip /opt/tensorrtserver/pip/. - -Some source changes seem to cause bazel to get confused and not -correctly rebuild all required sources. You can force bazel to rebuild -all of the TRTIS source without requiring a complete rebuild of the -TensorFlow and Caffe2 components by doing the following before issuing -the above build command:: - - # rm -fr bazel-bin/src - -.. include:: client.rst - :start-after: build-client-begin-marker-do-not-remove - :end-before: build-client-end-marker-do-not-remove - -Building the Documentation --------------------------- - -The TRTIS documentation is found in the docs/ directory and is based -on `Sphinx `_. `Doxygen -`_ integrated with `Exhale -`_ is used for C++ API -docuementation. - -To build the docs install the required dependencies:: - - $ apt-get update - $ apt-get install -y --no-install-recommends doxygen - $ pip install --upgrade sphinx sphinx-rtd-theme nbsphinx exhale - -To get the Python client library API docs the TensorRT Inference -Server Python package must be installed:: - - $ pip install --upgrade tensorrtserver-*.whl - -Then use Sphinx to build the documentation into the build/html -directory:: - - $ cd docs - $ make clean html diff --git a/docs/client.rst b/docs/client.rst deleted file mode 100644 index 0335b32cbd..0000000000 --- a/docs/client.rst +++ /dev/null @@ -1,383 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. 
- # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -.. _section-client-libraries-and-examples: - -Client Libraries and Examples -============================= - -The TRTIS *client libraries* make it easy to communicate with the -TensorRT Inference Server from you C++ or Python application. Using -these libraries you can send either HTTP or GRPC requests to TRTIS to -check server status or health and to make inference requests. - -A couple of example applications show how to use the client libraries -to perform image classification and to test performance: - -* C++ and Python versions of *image\_client*, an example application - that uses the C++ or Python client library to execute image - classification models on the TensorRT Inference Server. - -* Python version of *grpc\_image\_client*, an example application that - is functionally equivalent to *image\_client* but that uses GRPC - generated client code to communicate with TRTIS (instead of the - client library). - -* C++ version of *perf\_client*, an example application that issues a - large number of concurrent requests to TRTIS to measure latency and - throughput for a given model. You can use this to experiment with - different model configuration settings for your models. - -.. build-client-begin-marker-do-not-remove - -.. _section-building-the-client-libraries-and-examples: - -Building the Client Libraries and Examples ------------------------------------------- - -The provided Dockerfile can be used to build just the client libraries -and examples. Issue the following command to build the C++ client -library, C++ and Python examples, and a Python wheel file for the -Python client library:: - - $ docker build -t tensorrtserver_clients --target trtserver_build --build-arg "PYVER=" --build-arg "BUILD_CLIENTS_ONLY=1" . - -The -\\-build-arg setting PYVER is optional and can be used to set the -Python version that you want the Python client library built for (the -default is 3.5). - -After the build completes, the easiest way to extract the built -libraries and examples from the docker image is to mount a host -directory and then copy them out from within the container:: - - $ docker run -it --rm -v/tmp:/tmp/host tensorrtserver_clients - # cp /opt/tensorrtserver/bin/image_client /tmp/host/. - # cp /opt/tensorrtserver/bin/perf_client /tmp/host/. - # cp /opt/tensorrtserver/bin/simple_client /tmp/host/. - # cp /opt/tensorrtserver/pip/tensorrtserver-*.whl /tmp/host/. - # cp /opt/tensorrtserver/lib/librequest.* /tmp/host/. - -You can now access the files from /tmp on the host system. 
To run the -C++ examples you must install some dependencies on your host system:: - - $ apt-get install curl libcurl3-dev libopencv-dev libopencv-core-dev python-pil - -To run the Python examples you will need to additionally install the -client whl file and some other dependencies:: - - $ apt-get install python3 python3-pip - $ pip3 install --user --upgrade tensorrtserver-*.whl pillow - -.. build-client-end-marker-do-not-remove - -.. _section-image_classification_example: - -Image Classification Example Application ----------------------------------------- - -The image classification example that uses the C++ client API is -available at `src/clients/c++/image\_client.cc -`_. The -Python version of the image classification client is available at -`src/clients/python/image\_client.py -`_. - -To use image\_client (or image\_client.py) you must first have a -running TRTIS that is serving one or more image classification -models. The image\_client application requires that the model have a -single image input and produce a single classification output. If you -don't have a model repository with image classification models see -:ref:`section-example-model-repository` for instructions on how to -create one. - -Follow the instructions in :ref:`section-running-the-inference-server` -to launch TRTIS using the model repository. Once the server is running -you can use the image\_client application to send inference requests -to the server. You can specify a single image or a directory holding -images. Here we send a request for the resnet50_netdef model from the -:ref:`example model repository ` for -an image from the `qa/images -`_ -directory:: - - $ image_client -m resnet50_netdef -s INCEPTION qa/images/mug.jpg - Request 0, batch size 1 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.723991 - -The Python version of the application accepts the same command-line -arguments:: - - $ src/clients/python/image_client.py -m resnet50_netdef -s INCEPTION qa/images/mug.jpg - Request 0, batch size 1 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.778078556061 - -The image\_client and image\_client.py applications use the TRTIS -client library to talk to the server. By default image\_client -instructs the client library to use HTTP protocol to talk to TRTIS, -but you can use GRPC protocol by providing the \-i flag. You must also -use the \-u flag to point at the GRPC endpoint on TRTIS:: - - $ image_client -i grpc -u localhost:8001 -m resnet50_netdef -s INCEPTION qa/images/mug.jpg - Request 0, batch size 1 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.723991 - -By default the client prints the most probable classification for the -image. Use the \-c flag to see more classifications:: - - $ image_client -m resnet50_netdef -s INCEPTION -c 3 qa/images/mug.jpg - Request 0, batch size 1 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.723991 - 968 (CUP) = 0.270953 - 967 (ESPRESSO) = 0.00115996 - -The \-b flag allows you to send a batch of images for inferencing. -The image\_client application will form the batch from the image or -images that you specified. 
If the batch is bigger than the number of -images then image\_client will just repeat the images to fill the -batch:: - - $ image_client -m resnet50_netdef -s INCEPTION -c 3 -b 2 qa/images/mug.jpg - Request 0, batch size 2 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.778078556061 - 968 (CUP) = 0.213262036443 - 967 (ESPRESSO) = 0.00293014757335 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.778078556061 - 968 (CUP) = 0.213262036443 - 967 (ESPRESSO) = 0.00293014757335 - -Provide a directory instead of a single image to perform inferencing -on all images in the directory:: - - $ image_client -m resnet50_netdef -s INCEPTION -c 3 -b 2 qa/images - Request 0, batch size 2 - Image '../qa/images/car.jpg': - 817 (SPORTS CAR) = 0.836187 - 511 (CONVERTIBLE) = 0.0708251 - 751 (RACER) = 0.0597549 - Image '../qa/images/mug.jpg': - 504 (COFFEE MUG) = 0.723991 - 968 (CUP) = 0.270953 - 967 (ESPRESSO) = 0.00115996 - Request 1, batch size 2 - Image '../qa/images/vulture.jpeg': - 23 (VULTURE) = 0.992326 - 8 (HEN) = 0.00231854 - 84 (PEACOCK) = 0.00201471 - Image '../qa/images/car.jpg': - 817 (SPORTS CAR) = 0.836187 - 511 (CONVERTIBLE) = 0.0708251 - 751 (RACER) = 0.0597549 - -The grpc\_image\_client.py application at available at -`src/clients/python/grpc\_image\_client.py -`_ -behaves the same as the image\_client except that instead of using the -TRTIS client library it uses the GRPC generated client library to -communicate with TRTIS. - -Performance Example Application -------------------------------- - -The perf\_client example application located at -`src/clients/c++/perf\_client.cc -`_ -uses the C++ client API to send concurrent requests to TRTIS to -measure latency and inferences per second under varying client loads. - -To use perf\_client you must first have a running TRTIS that is -serving one or more models. The perf\_client application works with -any type of model by sending random data for all input tensors and by -reading and ignoring all output tensors. If you don't have a model -repository see :ref:`section-example-model-repository` for -instructions on how to create one. - -Follow the instructions in :ref:`section-running-the-inference-server` -to launch TRTIS using the model repository. - -The perf\_client application has two major modes. In the first mode -you specify how many concurrent clients you want to simulate and -perf\_client finds a stable latency and inferences/second for that -level of concurrency. Use the \-t flag to control concurrency and \-v -to see verbose output. The following example simulates four clients -continuously sending requests to TRTIS:: - - $ perf_client -m resnet50_netdef -p3000 -t4 -v - *** Measurement Settings *** - Batch size: 1 - Measurement window: 3000 msec - - Request concurrency: 4 - Pass [1] throughput: 207 infer/sec. Avg latency: 19268 usec (std 910 usec) - Pass [2] throughput: 206 infer/sec. Avg latency: 19362 usec (std 941 usec) - Pass [3] throughput: 208 infer/sec. Avg latency: 19252 usec (std 841 usec) - Client: - Request count: 624 - Throughput: 208 infer/sec - Avg latency: 19252 usec (standard deviation 841 usec) - Avg HTTP time: 19224 usec (send 714 usec + response wait 18486 usec + receive 24 usec) - Server: - Request count: 749 - Avg request latency: 17886 usec (overhead 55 usec + queue 26 usec + compute 17805 usec) - -In the second mode perf\_client will generate an inferences/second -vs. latency curve by increasing concurrency until a specific latency -limit or concurrency limit is reached. 
This mode is enabled by using -the \-d option and \-l to specify the latency limit and optionally the -\-c to specify a maximum concurrency limit:: - - $ perf_client -m resnet50_netdef -p3000 -d -l50 -c 3 - *** Measurement Settings *** - Batch size: 1 - Measurement window: 3000 msec - Latency limit: 50 msec - Concurrency limit: 3 concurrent requests - - Request concurrency: 1 - Client: - Request count: 327 - Throughput: 109 infer/sec - Avg latency: 9191 usec (standard deviation 822 usec) - Avg HTTP time: 9188 usec (send/recv 1007 usec + response wait 8181 usec) - Server: - Request count: 391 - Avg request latency: 7661 usec (overhead 90 usec + queue 68 usec + compute 7503 usec) - - Request concurrency: 2 - Client: - Request count: 521 - Throughput: 173 infer/sec - Avg latency: 11523 usec (standard deviation 616 usec) - Avg HTTP time: 11448 usec (send/recv 711 usec + response wait 10737 usec) - Server: - Request count: 629 - Avg request latency: 10018 usec (overhead 70 usec + queue 41 usec + compute 9907 usec) - - Request concurrency: 3 - Client: - Request count: 580 - Throughput: 193 infer/sec - Avg latency: 15518 usec (standard deviation 635 usec) - Avg HTTP time: 15487 usec (send/recv 779 usec + response wait 14708 usec) - Server: - Request count: 697 - Avg request latency: 14083 usec (overhead 59 usec + queue 30 usec + compute 13994 usec) - - Inferences/Second vs. Client Average Batch Latency - Concurrency: 1, 109 infer/sec, latency 9191 usec - Concurrency: 2, 173 infer/sec, latency 11523 usec - Concurrency: 3, 193 infer/sec, latency 15518 usec - -Use the \-f flag to generate a file containing CSV output of the -results:: - - $ perf_client -m resnet50_netdef -p3000 -d -l50 -c 3 -f perf.csv - -You can then import the CSV file into a spreadsheet to help visualize -the latency vs inferences/second tradeoff as well as see some -components of the latency. Follow these steps: - -- Open `this spreadsheet `_ -- Make a copy from the File menu "Make a copy..." -- Open the copy -- Select the A2 cell -- From the File menu select "Import..." -- Select "Upload" and upload the file -- Select "Replace data at selected cell" and then select the "Import data" button - -.. _section-client-api: - -Client API ----------- - -The C++ client API exposes a class-based interface for querying server -and model status and for performing inference. The commented interface -is available at `src/clients/c++/request.h -`_ -and in the API Reference. - -The Python client API provides similar capabilities as the C++ -API. The commented interface is available at -`src/clients/python/\_\_init\_\_.py -`_ -and in the API Reference. - -A very simple C++ example application at -`src/clients/c++/simple\_client.cc -`_ -and a Python version at `src/clients/python/simple\_client.py -`_ -demonstrate basic client API usage. 
- -To run the the C++ version of the simple example, first build as -described in :ref:`section-building-the-client-libraries-and-examples` -and then:: - - $ simple_client - 0 + 1 = 1 - 0 - 1 = -1 - 1 + 1 = 2 - 1 - 1 = 0 - 2 + 1 = 3 - 2 - 1 = 1 - 3 + 1 = 4 - 3 - 1 = 2 - 4 + 1 = 5 - 4 - 1 = 3 - 5 + 1 = 6 - 5 - 1 = 4 - 6 + 1 = 7 - 6 - 1 = 5 - 7 + 1 = 8 - 7 - 1 = 6 - 8 + 1 = 9 - 8 - 1 = 7 - 9 + 1 = 10 - 9 - 1 = 8 - 10 + 1 = 11 - 10 - 1 = 9 - 11 + 1 = 12 - 11 - 1 = 10 - 12 + 1 = 13 - 12 - 1 = 11 - 13 + 1 = 14 - 13 - 1 = 12 - 14 + 1 = 15 - 14 - 1 = 13 - 15 + 1 = 16 - 15 - 1 = 14 - -To run the the Python version of the simple example, first build as -described in :ref:`section-building-the-client-libraries-and-examples` -and install the tensorrtserver whl, then:: - - $ python src/clients/python/simple_client.py diff --git a/docs/conf.py b/docs/conf.py old mode 100644 new mode 100755 index 917cb7ea71..505af4351d --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,13 +26,11 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -*- coding: utf-8 -*- -# # Configuration file for the Sphinx documentation builder. # -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- @@ -38,223 +38,251 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('..')) -from builtins import str import os -import re -import sphinx_rtd_theme -import subprocess -import textwrap - -# -- Project information ----------------------------------------------------- - -project = u'NVIDIA TensorRT Inference Server' -copyright = u'2018, NVIDIA Corporation' -author = u'NVIDIA Corporation' - -version_long = u'0.0.0' -with open("../VERSION") as f: - version_long = f.readline() -version_short = re.match('^[\d]+\.[\d]+', version_long).group(0) +from docutils import nodes +from sphinx import search -git_sha = os.getenv("GIT_SHA") +# import sys +# sys.path.insert(0, os.path.abspath('.')) -if not git_sha: - try: - git_sha = subprocess.check_output(["git", "log", "--pretty=format:'%h'", "-n1"]).decode('ascii').replace("'","").strip() - except: - git_sha = u'0000000' +# -- Project information ----------------------------------------------------- -git_sha = git_sha[:7] if len(git_sha) > 7 else git_sha +project = "NVIDIA Triton Inference Server" +copyright = "2018-2024, NVIDIA Corporation" +author = "NVIDIA" -version = str(version_long + u"-" + git_sha) # The full version, including alpha/beta/rc tags -release = str(version_long) +# Env only set during riva-release process, otherwise keep as dev for all internal builds +release = os.getenv("TRITON_VERSION", "dev") -# hack: version is used for html creation, so put the version picker -# link here as well: -version = version + """
-Version select: """ +# maintain left-side bar toctrees in `contents` file +# so it doesn't show up needlessly in the index page +master_doc = "contents" # -- General configuration --------------------------------------------------- -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.mathjax', - 'sphinx.ext.napoleon', - 'sphinx.ext.ifconfig', - 'sphinx.ext.extlinks', - 'nbsphinx', - 'breathe', - 'exhale' + "ablog", + "myst_parser", + "sphinx_copybutton", + "sphinx_design", + "sphinx-prompt", + # "sphinxcontrib.bibtex", + "sphinx_tabs.tabs", + "sphinx_sitemap", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", + "sphinx.ext.ifconfig", + "sphinx.ext.extlinks", ] -# Add any paths that contain templates here, relative to this directory. -templates_path = ['templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] +suppress_warnings = ["myst.domains", "ref.ref", "myst.header"] -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +source_suffix = [".rst", ".md"] -# Setup the breathe extension -breathe_projects = { - "BreatheTRTIS": "./doxyoutput/xml" -} -breathe_default_project = "BreatheTRTIS" - -# Setup the exhale extension -exhale_args = { - # These arguments are required - "containmentFolder": "./cpp_api", - "rootFileName": "cpp_api_root.rst", - "rootFileTitle": "C++ API", - "doxygenStripFromPath": "..", - # Suggested optional arguments - "createTreeView": True, - # TIP: if using the sphinx-bootstrap-theme, you need - # "treeViewIsBootstrap": True, - "exhaleExecutesDoxygen": True, - "exhaleDoxygenStdin": textwrap.dedent(''' - JAVADOC_AUTOBRIEF = YES - INPUT = ../src/clients/c++/request.h - ''') +autodoc_default_options = { + "members": True, + "undoc-members": True, + "private-members": True, } -# Tell sphinx what the primary language being documented is. -#primary_domain = 'cpp' +autosummary_generate = True +autosummary_mock_imports = [ + "tritonclient.grpc.model_config_pb2", + "tritonclient.grpc.service_pb2", + "tritonclient.grpc.service_pb2_grpc", +] -# Tell sphinx what the pygments highlight language should be. 
-highlight_language = 'text' +napoleon_include_special_with_doc = True + +numfig = True + +# final location of docs for seo/sitemap +html_baseurl = ( + "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/" +) + +myst_enable_extensions = [ + "dollarmath", + "amsmath", + "deflist", + # "html_admonition", + "html_image", + "colon_fence", + # "smartquotes", + "replacements", + # "linkify", + "substitution", +] +myst_heading_anchors = 5 +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["README.md", "examples/README.md", "user_guide/perf_analyzer.md"] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -html_theme_options = { - 'canonical_url': 'https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/index.html', - 'collapse_navigation': False, - 'display_version': True, - 'logo_only': False, -} +html_theme = "sphinx_book_theme" +html_logo = "_static/nvidia-logo-horiz-rgb-blk-for-screen.png" +html_title = "NVIDIA Triton Inference Server" +html_short_title = "Triton" +html_copy_source = True +html_sourcelink_suffix = "" +html_favicon = "_static/nvidia-logo-vert-rgb-blk-for-screen.png" +html_last_updated_fmt = "" +html_additional_files = ["index.html"] # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = 'NVIDIATRTISdoc' - - -# -- Options for LaTeX output ------------------------------------------------ +html_static_path = ["_static"] +html_css_files = ["custom.css"] -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. 
- # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', +html_theme_options = { + "path_to_docs": "docs", + # "launch_buttons": { + # "binderhub_url": "https://mybinder.org", + # "colab_url": "https://colab.research.google.com/", + # "deepnote_url": "https://deepnote.com/", + # "notebook_interface": "jupyterlab", + # "thebe": True, + # # "jupyterhub_url": "https://datahub.berkeley.edu", # For testing + # }, + "use_edit_page_button": False, + "use_issues_button": True, + "use_repository_button": True, + "use_download_button": False, + "logo_only": False, + "show_toc_level": 2, + "extra_navbar": "", + "extra_footer": """ + Privacy Policy | + Manage My Privacy | + Do Not Sell or Share My + Data | + Terms of Service | + Accessibility | + Corporate Policies | + Product Security | + Contact""", + "repository_url": "https://github.com/triton-inference-server/server", + "use_repository_button": True, } -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'NVIDIATRTIS.tex', u'NVIDIA TensorRT Inference Server Documentation', - u'NVIDIA Corporation', 'manual'), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'nvidiatrtis', u'NVIDIA TensorRT Inference Server Documentation', - [author], 1) -] +version_short = release +deploy_ngc_org = "nvidia" +deploy_ngc_team = "triton" +myst_substitutions = { + "VersionNum": version_short, + "deploy_ngc_org_team": f"{deploy_ngc_org}/{deploy_ngc_team}" + if deploy_ngc_team + else deploy_ngc_org, +} -# -- Options for Texinfo output ---------------------------------------------- +def ultimateReplace(app, docname, source): + result = source[0] + for key in app.config.ultimate_replacements: + result = result.replace(key, app.config.ultimate_replacements[key]) + source[0] = result -# Grouping the document tree into Texinfo files. 
List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'NVIDIATRTIS', u'NVIDIA TensorRT Inference Server Documentation', - author, 'NVIDIATRTIS', 'One line description of project.', - 'Miscellaneous'), -] +# this is a necessary hack to allow us to fill in variables that exist in code blocks +ultimate_replacements = { + "{VersionNum}": version_short, + "{SamplesVersionNum}": version_short, + "{NgcOrgTeam}": f"{deploy_ngc_org}/{deploy_ngc_team}" + if deploy_ngc_team + else deploy_ngc_org, +} -# -- Extension configuration ------------------------------------------------- -extlinks = {'issue': ('https://github.com/NVIDIA/tensorrt-inference-server/issues/%s', - 'issue '), - 'fileref': ('https://github.com/NVIDIA/tensorrt-inference-server/tree/' + - (git_sha if git_sha != u'0000000' else "master") + '/%s', ''),} +# bibtex_bibfiles = ["references.bib"] +# To test that style looks good with common bibtex config +# bibtex_reference_style = "author_year" +# bibtex_default_style = "plain" + +### We currently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html +nb_execution_mode = "off" # Global execution disable +# execution_excludepatterns = ['tutorials/tts-python-basics.ipynb'] # Individual notebook disable + + +def setup(app): + app.add_config_value("ultimate_replacements", {}, True) + app.connect("source-read", ultimateReplace) + app.add_js_file("https://js.hcaptcha.com/1/api.js") + + visitor_script = ( + "//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" + ) + + if visitor_script: + app.add_js_file(visitor_script) + + # if not os.environ.get("READTHEDOCS") and not os.environ.get("GITHUB_ACTIONS"): + # app.add_css_file( + # "https://assets.readthedocs.org/static/css/readthedocs-doc-embed.css" + # ) + # app.add_css_file("https://assets.readthedocs.org/static/css/badge_only.css") + + # # Create the dummy data file so we can link it + # # ref: https://github.com/readthedocs/readthedocs.org/blob/bc3e147770e5740314a8e8c33fec5d111c850498/readthedocs/core/static-src/core/js/doc-embed/footer.js # noqa: E501 + # app.add_js_file("rtd-data.js") + # app.add_js_file( + # "https://assets.readthedocs.org/static/javascript/readthedocs-doc-embed.js", + # priority=501, + # ) + + +# Patch for sphinx.search stemming short terms (i.e. 
tts -> tt) +# https://github.com/sphinx-doc/sphinx/blob/4.5.x/sphinx/search/__init__.py#L380 +def sphinxSearchIndexFeed( + self, docname: str, filename: str, title: str, doctree: nodes.document +): + """Feed a doctree to the index.""" + self._titles[docname] = title + self._filenames[docname] = filename + + visitor = search.WordCollector(doctree, self.lang) + doctree.walk(visitor) + + # memoize self.lang.stem + def stem(word: str) -> str: + try: + return self._stem_cache[word] + except KeyError: + self._stem_cache[word] = self.lang.stem(word).lower() + return self._stem_cache[word] + + _filter = self.lang.word_filter + + for word in visitor.found_title_words: + stemmed_word = stem(word) + if len(stemmed_word) > 3 and _filter(stemmed_word): + self._title_mapping.setdefault(stemmed_word, set()).add(docname) + elif _filter(word): # stemmer must not remove words from search index + self._title_mapping.setdefault(word.lower(), set()).add(docname) + + for word in visitor.found_words: + stemmed_word = stem(word) + # again, stemmer must not remove words from search index + if len(stemmed_word) <= 3 or not _filter(stemmed_word) and _filter(word): + stemmed_word = word.lower() + already_indexed = docname in self._title_mapping.get(stemmed_word, set()) + if _filter(stemmed_word) and not already_indexed: + self._mapping.setdefault(stemmed_word, set()).add(docname) + + +search.IndexBuilder.feed = sphinxSearchIndexFeed diff --git a/docs/contents.md b/docs/contents.md new file mode 100644 index 0000000000..5aaafa7afa --- /dev/null +++ b/docs/contents.md @@ -0,0 +1,156 @@ + + +```{toctree} +:maxdepth: 1 +:caption: Getting Started + +getting_started/quickstart +``` + +```{toctree} +:maxdepth: 1 +:caption: User Guide + +user_guide/performance_tuning +user_guide/architecture +user_guide/model_repository +customization_guide/repository_agents +user_guide/model_configuration +user_guide/request_cancellation +user_guide/optimization +user_guide/ragged_batching +user_guide/rate_limiter +user_guide/model_analyzer +user_guide/model_management +user_guide/custom_operations +user_guide/decoupled_models +user_guide/response_cache +user_guide/metrics +user_guide/trace +user_guide/jetson +user_guide/v1_to_v2 +customization_guide/deploy +``` + +```{toctree} +:maxdepth: 1 +:caption: Debugging + +user_guide/debugging_guide +user_guide/faq +``` + +```{toctree} +:maxdepth: 1 +:caption: Protocol Guides + +protocol/README +customization_guide/inference_protocols +protocol/extension_binary_data +protocol/extension_classification +protocol/extension_generate +protocol/extension_logging +protocol/extension_model_configuration +protocol/extension_model_repository +protocol/extension_schedule_policy +protocol/extension_sequence +protocol/extension_shared_memory +protocol/extension_statistics +protocol/extension_trace +protocol/extension_parameters +``` + +```{toctree} +:maxdepth: 1 +:caption: Customization Guide + +customization_guide/build +customization_guide/compose +customization_guide/test +``` + +```{toctree} +:maxdepth: 1 +:caption: Examples + +examples/jetson/README +examples/jetson/concurrency_and_dynamic_batching/README +``` + +```{toctree} +:maxdepth: 1 +:caption: Client + +client/README +_reference/tritonclient_api.rst +client/src/java/README +client/src/grpc_generated/go/README +client/src/grpc_generated/javascript/README +client/src/grpc_generated/java/README +``` + +```{toctree} +:maxdepth: 1 +:caption: Performance Analyzer + +perf_analyzer/README +perf_analyzer/docs/README +perf_analyzer/docs/install 
+perf_analyzer/docs/quick_start +perf_analyzer/docs/cli +perf_analyzer/docs/inference_load_modes +perf_analyzer/docs/input_data +perf_analyzer/docs/measurements_metrics +perf_analyzer/docs/benchmarking +perf_analyzer/genai-perf/README +perf_analyzer/genai-perf/docs/compare +perf_analyzer/genai-perf/docs/embeddings +perf_analyzer/genai-perf/docs/files +perf_analyzer/genai-perf/docs/lora +perf_analyzer/genai-perf/docs/multi_modal +perf_analyzer/genai-perf/docs/rankings +perf_analyzer/genai-perf/docs/tutorial +perf_analyzer/genai-perf/examples/tutorial +``` + +```{toctree} +:maxdepth: 1 +:caption: Python Backend + +python_backend/README +python_backend/inferentia/README +python_backend/examples/auto_complete/README +python_backend/examples/bls/README +python_backend/examples/bls_decoupled/README +python_backend/examples/custom_metrics/README +python_backend/examples/decoupled/README +python_backend/examples/instance_kind/README +python_backend/examples/jax/README +python_backend/examples/preprocessing/README +``` diff --git a/docs/contribute.rst b/docs/contribute.rst deleted file mode 100644 index edc1bc0e3a..0000000000 --- a/docs/contribute.rst +++ /dev/null @@ -1,45 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Contributing -============ - -Contributions to TensorRT Inference Server are more than welcome. To -contribute make a pull request and follow the guidelines outlined in -the `CONTRIBUTING -`_ -document. - -Coding Convention ------------------ - -Use clang-format to format all source files (\*.h, \*.cc, \*.proto) to -a consistent format. 
You should run clang-format on all source files -before submitting a pull request:: - - $ apt-get install clang-format clang-format-6.0 - $ clang-format-6.0 --style=file -i *.proto *.cc *.h diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md new file mode 100644 index 0000000000..56e5875776 --- /dev/null +++ b/docs/customization_guide/build.md @@ -0,0 +1,521 @@ + + +# Building Triton + +This section describes how to build the Triton server from source. For +information on building the Triton client libraries and examples see +[Client Libraries and +Examples](https://github.com/triton-inference-server/client). For +information on building the Triton SDK container see [Build SDK +Image](test.md#build-sdk-image). For information on testing your +Triton build see [Testing Triton](test.md). + +You can create a customized Triton Docker image that contains a subset +of the released backends without building from source. For example, +you may want a Triton image that contains only the TensorRT and Python +backends. For this type of customization you don't need to build +Triton from source and instead can use [the *compose* +utility](compose.md). + +The Triton source is distributed across multiple GitHub repositories +that together can be built and installed to create a complete Triton +installation. Triton server is built using CMake and (optionally) +Docker. To simplify the build process, Triton provides a +[build.py](https://github.com/triton-inference-server/server/blob/main/build.py) script. +The build.py script will generate the CMake and Docker build steps required to +build Triton, and will optionally invoke those steps or leave the invocation to +you, as described below. + +The build.py script currently supports building Triton for the +following platforms. See [Building on Unsupported +Platforms](#building-on-unsupported-platforms) if you are attempting +to build Triton on a platform that is not listed here. + +* [Ubuntu 22.04, x86-64](#building-for-ubuntu-2204) + +* [Jetpack 4.x, NVIDIA Jetson (Xavier, Nano, TX2)](#building-for-jetpack-4x) + +* [Windows 10, x86-64](#building-for-windows-10) + +If you are developing or debugging Triton, see [Development and +Incremental Builds](#development-and-incremental-builds) for information +on how to perform incremental build. + +## Building for Ubuntu 22.04 + +For Ubuntu-22.04, build.py supports both a Docker build and a +non-Docker build. + +* [Build using Docker](#building-with-docker) and the TensorFlow and PyTorch + Docker images from [NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com). + +* [Build without Docker](#building-without-docker). + +### Building With Docker + +The easiest way to build Triton is to use Docker. The result of the +build will be a Docker image called *tritonserver* that will contain +the tritonserver executable in /opt/tritonserver/bin and the required +shared libraries in /opt/tritonserver/lib. The backends and +repository-agents built for Triton will be in +/opt/tritonserver/backends and /opt/tritonserver/repoagents, +respectively. + +The first step for the build is to clone the +[triton-inference-server/server](https://github.com/triton-inference-server/server) +repo branch for the release you are interested in building (or the +*main* branch to build from the development branch). Then run build.py +as described below. The build.py script performs these steps when +building with Docker. 
+ +* In the *build* subdirectory of the server repo, generate the + docker_build script, the cmake_build script and the Dockerfiles + needed to build Triton. If you use the --dryrun flag, build.py will + stop here so that you can examine these files. + +* Run the docker_build script to perform the Docker-based build. The + docker_build script performs the following steps. + + * Build the *tritonserver_buildbase* Docker image that collects all + the build dependencies needed to build Triton. The + *tritonserver_buildbase* image is based on a minimal/base + image. When building with GPU support (--enable-gpu), the *min* + image is the + [\-py3-min](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) + image pulled from [NGC](https://ngc.nvidia.com) that contains the + CUDA, cuDNN, TensorRT and other dependencies that are required to + build Triton. When building without GPU support, the *min* image + is the standard ubuntu:22.04 image. + + * Run the cmake_build script within the *tritonserver_buildbase* + image to actually build Triton. The cmake_build script performs + the following steps. + + * Invoke CMake in the server repo to build Triton's core shared + library and *tritonserver* executable. + + * Clone each requested backend and build it using CMake. For + example, the ONNX Runtime backend is built using + [triton-inference-server/onnxruntime_backend/CMakeLists.txt](https://github.com/triton-inference-server/onnxruntime_backend/blob/main/CMakeLists.txt). Some + of the backends may use Docker as part of their build (for + example [ONNX + Runtime](https://github.com/triton-inference-server/onnxruntime_backend) + and + [OpenVINO](https://github.com/triton-inference-server/openvino_backend)). If + you don't want to use Docker in those cases you must consult the + build process for those backends. + + * Clone each repository agent and build it using the CMake file + from the corresponding repo. For example, the + [Checksum](https://github.com/triton-inference-server/checksum_repository_agent) + repository agent is built using + [triton-inference-server/checksum_repository_agent/CMakeLists.txt](https://github.com/triton-inference-server/checksum_repository_agent/blob/main/CMakeLists.txt). + + * Copy the built artifacts out of the container and into the build + subdirectory on the host system. + + * Create the final *tritonserver* Docker image that contains the + libraries, executables and other artifacts from the build. + + * Create a *tritonserver_cibase* Docker image that contains the QA + artifacts needed for testing, as described in [Testing + Triton](test.md). + +By default, build.py does not enable any of Triton's optional features +but you can enable all features, backends, and repository agents with +the --enable-all flag. The -v flag turns on verbose output. + +```bash +$ ./build.py -v --enable-all +``` + +If you want to enable only certain Triton features, backends and +repository agents, do not specify --enable-all. Instead you must +specify the individual flags as documented by --help. + +#### Building With Specific GitHub Branches + +As described above, the build is performed in the server repo, but +source from several other repos is fetched during the build +process. Typically you do not need to specify anything about these +other repos, but if you want to control which branch is used in these +other repos you can as shown in the following example. + +```bash +$ ./build.py ... --repo-tag=common: --repo-tag=core: --repo-tag=backend: --repo-tag=thirdparty: ... 
--backend=tensorrt: ... --repoagent=checksum: ... +``` + +If you are building on a release branch then `` will +default to the branch name. For example, if you are building on the +r24.09 branch, `` will default to r24.09. If you are +building on any other branch (including the *main* branch) then +`` will default to "main". Therefore, you typically do +not need to provide `` at all (nor the preceding +colon). You can use a different `` for a component to +instead use the corresponding branch/tag in the build. For example, if +you have a branch called "mybranch" in the +[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend) +repo that you want to use in the build, you would specify +--backend=onnxruntime:mybranch. + +#### CPU-Only Build + +If you want to build without GPU support you must specify individual +feature flags and not include the `--enable-gpu` and +`--enable-gpu-metrics` flags. Only the following backends are +available for a non-GPU / CPU-only build: `identity`, `repeat`, `ensemble`, +`square`, `tensorflow2`, `pytorch`, `onnxruntime`, `openvino`, +`python` and `fil`. + +To include the TensorFlow2 backend in your CPU-only build, you must +provide this additional flag to build.py: +`--extra-backend-cmake-arg=tensorflow2:TRITON_TENSORFLOW_INSTALL_EXTRA_DEPS=ON`. + +CPU-only builds of the TensorFlow and PyTorch backends require some CUDA stubs +and runtime dependencies that are not present in the CPU-only base container. +These are retrieved from a GPU base container, which can be changed with the +`--image=gpu-base,nvcr.io/nvidia/tritonserver:-py3-min` flag. + +### Building Without Docker + +To build Triton without using Docker you must install the build +dependencies that are handled automatically when building with Docker. + +The first step for the build is to clone the +[triton-inference-server/server](https://github.com/triton-inference-server/server) +repo branch for the release you are interested in building (or the +*main* branch to build from the development branch). + +To determine what dependencies are required by the build, run build.py +with the --dryrun flag, and then looking in the build subdirectory at +Dockerfile.buildbase. + +```bash +$ ./build.py -v --enable-all +``` + +From Dockerfile.buildbase you can see what dependencies you need to +install on your host system. Note that when building with --enable-gpu +(or --enable-all), Dockerfile.buildbase depends on the +[\-py3-min](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +image pulled from [NGC](https://ngc.nvidia.com). Unfortunately, a +Dockerfile is not currently available for the +[\-py3-min](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +image. Instead, you must manually install [CUDA and +cuDNN](#cuda-cublas-cudnn) and [TensorRT](#tensorrt) dependencies as +described below. + +Once you have installed these dependencies on your build system you +can then use build.py with the --no-container-build flag to build +Triton. + +```bash +$ ./build.py -v --no-container-build --build-dir=`pwd`/build --enable-all +``` + +See [Building with Docker](#building-with-docker) for more details on how the +cmake_build script is used to perform the build. + +#### CUDA, cuBLAS, cuDNN + +For Triton to support NVIDIA GPUs you must install CUDA, cuBLAS and +cuDNN. These libraries must be installed on the system include and +library paths so that they are available for the build. 
The version of +the libraries used for a given release can be found in the [Framework +Containers Support +Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +For a given version of Triton you can attempt to build with +non-supported versions of the libraries but you may have build or +execution issues since non-supported versions are not tested. + +#### TensorRT + +The TensorRT headers and libraries must be installed on system include +and library paths so that they are available for the build. The +version of TensorRT used in a given release can be found in the +[Framework Containers Support +Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +For a given version of Triton you can attempt to build with +non-supported versions of TensorRT but you may have build or execution +issues since non-supported versions are not tested. + +## Building for JetPack 4.x + +*Under Construction* + +## Building for Windows 10 + +For Windows 10, build.py supports both a Docker build and a non-Docker +build in a similar way as described for [Ubuntu](#building-for-ubuntu-2204). The primary +difference is that the minimal/base image used as the base of the +Dockerfile.buildbase image can be built from the provided +[Dockerfile.win10.min](https://github.com/triton-inference-server/server/blob/main/Dockerfile.win10.min) +file as described in [Windows 10 "Min" Image](#windows-10-min-image). When running build.py +use the --image flag to specify the tag that you assigned to this +image. For example, --image=base,win10-py3-min. + +### Windows and Docker + +Depending on your version of Windows 10 and your version of Docker you +may need to perform these additional steps before any of the following +steps. + +* Set your Docker to work with "Windows containers". Right click on + the whale icon in the lower-right status area and select "Switch to + Windows containers". + +### Windows 10 "Min" Image + +The "min" container describes the base dependencies needed to perform +the Windows build. The Windows min container is +[Dockerfile.win10.min](https://github.com/triton-inference-server/server/blob/main/Dockerfile.win10.min). + +Before building the min container you must download the appropriate +cuDNN and TensorRT versions and place them in the same directory as +Dockerfile.win10.min. + +* For cuDNN the CUDNN_VERSION and CUDNN_ZIP arguments defined in + Dockerfile.win10.min indicate the version of cuDNN that you should + download from https://developer.nvidia.com/rdp/cudnn-download. + +* For TensorRT the TENSORRT_VERSION and TENSORRT_ZIP arguments defined + in Dockerfile.win10.min indicate the version of TensorRT that you + should download from + https://developer.nvidia.com/nvidia-tensorrt-download. + +After downloading the zip files for cuDNN and TensorRT, you build the +min container using the following command. + +```bash +$ docker build -t win10-py3-min -f Dockerfile.win10.min . +``` + +### Build Triton Server + +Triton is built using the build.py script. The build system must have +Docker, Python3 (plus pip installed *docker* module) and git installed +so that it can execute build.py and perform a docker build. By +default, build.py does not enable any of Triton's optional features +and so you must enable them explicitly. The following build.py +invocation builds all features and backends available on Windows.
+ +```bash +python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common:<container tag> --repo-tag=core:<container tag> --repo-tag=backend:<container tag> --repo-tag=thirdparty:<container tag> --backend=ensemble --backend=tensorrt:<container tag> --backend=onnxruntime:<container tag> --backend=openvino:<container tag> --backend=python:<container tag> +``` + +If you are building on the *main* branch then `<container tag>` will +default to "main". If you are building on a release branch then +`<container tag>` will default to the branch name. For example, if you +are building on the r24.09 branch, `<container tag>` will default to +r24.09. Therefore, you typically do not need to provide `<container tag>` at all (nor the preceding colon). You can use a different +`<container tag>` for a component to instead use the corresponding +branch/tag in the build. For example, if you have a branch called +"mybranch" in the +[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend) +repo that you want to use in the build, you would specify +--backend=onnxruntime:mybranch. + +### Extract Build Artifacts + +When build.py completes, a Docker image called *tritonserver* will +contain the built Triton Server executable, libraries and other +artifacts. Windows containers do not support GPU access so you likely +want to extract the necessary files from the tritonserver image and +run them directly on your host system. All the Triton artifacts can be +found in the /opt/tritonserver directory of the tritonserver image. Your +host system will need to install the CUDA, cuDNN, TensorRT and other +dependencies that were used for the build. + +## Building on Unsupported Platforms + +Building for an unsupported OS and/or hardware platform is +possible. All of the build scripting, Dockerfiles and CMake +invocations are included in the public repos or are generated by +build.py as described in [Building with Docker](#building-with-docker). From +these files you can find the required dependencies and CMake +invocations. However, due to differences in compilers, libraries, +package management, etc. you may have to make changes in the build +scripts, Dockerfiles, CMake files and the source code. + +To see the generated build scripts and Dockerfiles referred to below, +use: + +```bash +$ ./build.py -v --enable-all --dryrun +``` + +You should familiarize yourself with the build process for supported +platforms by reading the above documentation and then follow the +process for the supported platform that most closely matches the +platform you are interested in (for example, if you are trying to +build for RHEL/x86-64 then follow the [Building for Ubuntu +22.04](#building-for-ubuntu-2204) process). You will likely need to +make changes in the following areas and then manually run docker_build +and cmake_build or the equivalent commands to perform a build. + +* The generated Dockerfiles install dependencies for the build using + platform-specific packaging tools, for example, apt-get for + Ubuntu. You will need to change build.py to use the packaging tool + appropriate for your platform. + +* The package and library names for your platform may differ from + those used by the generated Dockerfiles. You will need to find the + corresponding packages and libraries on your platform. + +* Your platform may use a different compiler or compiler version than + the supported platforms. As a result you may encounter build errors + that need to be fixed by editing the source code or changing the + compilation flags.
+ +* Triton depends on a large number of open-source packages that it + builds from source. If one of these packages does not support your + platform then you may need to disable the Triton feature that + depends on that package. For example, Triton supports the S3 + filesystem by building the aws-sdk-cpp package. If aws-sdk-cpp + doesn't build for your platform then you can remove the need for + that package by not specifying --filesystem=s3 when you run + build.py. In general, you should start by running build.py with the + minimal required feature set. + +* The + [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend) + backend extracts pre-built shared libraries from the TensorFlow NGC + container as part of the build. This container is only available for + Ubuntu-22.04 / x86-64, so if you require the TensorFlow backend for + your platform you will need to download the TensorFlow container and + modify its build to produce shared libraries for your platform. You + must use the TensorFlow source and build scripts from within the NGC + container because they contain Triton-specific patches that are + required for the Triton TensorFlow backend. + +* By default, the + [PyTorch](https://github.com/triton-inference-server/pytorch_backend) + backend build extracts pre-built shared libraries from the PyTorch + NGC container. But the build can also use PyTorch shared libraries + that you build separately for your platform. See the pytorch_backend + build process for details. + +## Development and Incremental Builds + +### Development Builds Without Docker + +If you are [building without Docker](#building-without-docker) use the +CMake invocation steps in cmake_build to invoke CMake to set up a +build environment where you can invoke make/msbuild.exe to incrementally +build the Triton core, a backend, or a repository agent. + +### Development Builds With Docker + +If you are [building with Docker](#building-with-docker), the generated +*tritonserver_buildbase* image contains all the dependencies needed to +perform a full or incremental build. Within *tritonserver_buildbase*, +/workspace/build/cmake_build contains the CMake invocations that are +used to build the Triton core, the backends, and the repository +agents. + +To perform an incremental build within the *tritonserver_buildbase* +container, map your source into the container and then run the +appropriate CMake and `make` (or `msbuild.exe`) steps from cmake_build +within the container. + +#### Development Build of Triton Core + +Assume you have a clone of the [server +repo](https://github.com/triton-inference-server/server) on your host +system where you are making changes and you want to perform +incremental builds to test those changes. Your source code is in +/home/me/server. Run the *tritonserver_buildbase* container and map +your server source directory into the container at /server. + +``` +$ docker run -it --rm -v/home/me/server:/server tritonserver_buildbase bash +``` + +Look at /workspace/build/cmake_build within the container for the +section of commands that build "Triton core library". You can follow +those commands exactly, or you can modify them to change the build +directory or the CMake options. You **must** change the CMake command +to use /server instead of /workspace as the location for the +CMakeLists.txt file and source: + +``` +$ cmake /server +``` + +Then you can change directory into the build directory and run `make` +(or `msbuild.exe`) as shown in cmake_build.
As you make changes to the +source on your host system, you can perform incremental builds by +re-running `make` (or `msbuild.exe`). + +#### Development Build of Backend or Repository Agent + +Performing a full or incremental build of a backend or repository +agent is similar to building the Triton core. As an example we will +use the TensorRT backend. Assume you have a clone of the [TensorRT +backend +repo](https://github.com/triton-inference-server/tensorrt_backend) on +your host system where you are making changes and you want to perform +incremental builds to test those changes. Your source code is in +/home/me/tensorrt_backend. Run the *tritonserver_buildbase* +container and map your TensorRT backend source directory into the +container at /tensorrt_backend. Note that some backends will use +Docker as part of their build, and so the host's Docker registry must +be made available within the *tritonserver_buildbase* by mounting +docker.sock (on Windows use +-v\\.\pipe\docker_engine:\\.\pipe\docker_engine). + +``` +$ docker run -it --rm -v/var/run/docker.sock:/var/run/docker.sock -v/home/me/tensorrt_backend:/tensorrt_backend tritonserver_buildbase bash +``` + +Look at /workspace/build/cmake_build within the container for the +section of commands that build "TensorRT backend". You can follow +those commands exactly, or you can modify them to change the build +directory or the CMake options. You **must** change the CMake command +to use /tensorrt_backend instead of /workspace as the location for the +CMakeLists.txt file and source: + +``` +$ cmake /tensorrt_backend +``` + +Then you can change directory into the build directory and run `make` +(or `msbuild.exe`) as shown in cmake_build. As you make changes to the +source on your host system, you can perform incremental builds by +re-running `make` (or `msbuild.exe`). + +### Building with Debug Symbols + +To build with Debug symbols, use the --build-type=Debug argument while +launching build.py. If building directly with CMake use +-DCMAKE_BUILD_TYPE=Debug. You can then launch the built server with +gdb and see the debug symbols/information in the gdb trace. diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md new file mode 100644 index 0000000000..0c6afc1e0b --- /dev/null +++ b/docs/customization_guide/compose.md @@ -0,0 +1,147 @@ + + +# Customize Triton Container + +Two Docker images are available from [NVIDIA GPU Cloud +(NGC)](https://ngc.nvidia.com) that make it possible to easily +construct customized versions of Triton. By customizing Triton you can +significantly reduce the size of the Triton image by removing +functionality that you don't require. + +Currently the customization is limited as described below but future +releases will increase the amount of customization that is available. +It is also possible to [build Triton](build.md#building-triton) +from source to get more exact customization. + +## Use the compose.py script + +The `compose.py` script can be found in the +[server repository](https://github.com/triton-inference-server/server). +Simply clone the repository and run `compose.py` to create a custom container. +Note: The created container version will depend on the branch that was cloned. +For example, branch + [r24.09](https://github.com/triton-inference-server/server/tree/r24.09) +should be used to create an image based on the NGC 24.09 Triton release.
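For example, the clone step described above might look like the following sketch; it assumes the r24.09 release branch mentioned above is the desired base, so substitute whichever release branch you need:

```
# Clone the server repo at the release branch that matches the Triton
# release you want to base the image on, then run compose.py from the
# checkout as shown in the examples that follow.
git clone -b r24.09 https://github.com/triton-inference-server/server.git
cd server
```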
+ +`compose.py` provides `--backend`, `--repoagent` options that allow you to +specify which backends and repository agents to include in the custom image. +For example, the following creates a new docker image that +contains only the PyTorch and TensorFlow backends and the checksum +repository agent. + +Example: +``` +python3 compose.py --backend pytorch --backend tensorflow --repoagent checksum +``` +will provide a container `tritonserver` locally. You can access the container +with +``` +$ docker run -it tritonserver:latest +``` + +Note: If `compose.py` is run on release versions `r21.08` and earlier, +the resulting container will have DCGM version 2.2.3 installed. +This may result in different GPU statistic reporting behavior. + +### Compose a specific version of Triton + +`compose.py` requires two containers: a `min` container which is the +base the compose container is built from and a `full` container from which the +script will extract components. The version of the `min` and `full` container +is determined by the branch of Triton `compose.py` is on. +For example, running +``` +python3 compose.py --backend pytorch --repoagent checksum +``` +on branch [r24.09](https://github.com/triton-inference-server/server/tree/r24.09) pulls: +- `min` container `nvcr.io/nvidia/tritonserver:24.09-py3-min` +- `full` container `nvcr.io/nvidia/tritonserver:24.09-py3` + +Alternatively, users can specify the version of Triton container to pull from +any branch by either: +1. Adding flag `--container-version <container_version>` to branch +``` +python3 compose.py --backend pytorch --repoagent checksum --container-version 24.09 +``` +2. Specifying `--image min,<min_container_image> --image full,<full_container_image>`. + The user is responsible for specifying compatible `min` and `full` containers. +``` +python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.09-py3-min --image full,nvcr.io/nvidia/tritonserver:24.09-py3 +``` +Methods 1 and 2 will result in the same composed container. Furthermore, the +`--image` flag overrides the `--container-version` flag when both are specified. + +Note: +1. All contents in the `/opt/tritonserver` directory of the `min` image will be + removed to ensure dependencies of the composed image are added properly. +2. vLLM and TensorRT-LLM backends are currently not supported backends for +`compose.py`. If you want to build additional backends on top of these backends, +it would be better to [build it yourself](#build-it-yourself) by using +`nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3` or +`nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3` as a `min` container. + + +### CPU-only container composition + +CPU-only containers are not yet available for customization. Please see + [build documentation](build.md) for instructions to build a full CPU-only + container. When including TensorFlow or PyTorch backends in the composed + container, an additional `gpu-min` container is needed +since this container provides the CUDA stubs and runtime dependencies which are +not provided in the CPU-only min container. + +## Build it yourself + +If you would like to do what `compose.py` is doing under the hood yourself, you + can run `compose.py` with the `--dry-run` option and then modify the + `Dockerfile.compose` file to satisfy your needs. + + +### Triton with Unsupported and Custom Backends + +You can [create and build your own Triton +backend](https://github.com/triton-inference-server/backend).
The +result of that build should be a directory containing your backend +shared library and any additional files required by the +backend. Assuming your backend is called "mybackend" and that the +directory is "./mybackend", adding the following to the Dockerfile `compose.py` +created will create a Triton image that contains all the supported Triton +backends plus your custom backend. + +``` +COPY ./mybackend /opt/tritonserver/backends/mybackend +``` + +You also need to install any additional dependencies required by your +backend as part of the Dockerfile. Then use Docker to create the +image. + +``` +$ docker build -t tritonserver_custom -f Dockerfile.compose . +``` diff --git a/docs/customization_guide/deploy.md b/docs/customization_guide/deploy.md new file mode 100644 index 0000000000..112a2cebcf --- /dev/null +++ b/docs/customization_guide/deploy.md @@ -0,0 +1,279 @@ + + +# Secure Deployment Considerations + +The Triton Inference Server project is designed for flexibility and +allows developers to create and deploy inferencing solutions in a +variety of ways. Developers can deploy Triton as an http server, a +grpc server, a server supporting both, or embed a Triton server into +their own application. Developers can deploy Triton locally or in the +cloud, within a Kubernetes cluster behind an API gateway or as a +standalone process. This guide is intended to provide some key points +and best practices that users deploying Triton based solutions should +consider. + +| [Deploying Behind a Secure Gateway or Proxy](#deploying-behind-a-secure-proxy-or-gateway) | [Running with Least Privilege](#running-with-least-privilege) | + +> [!IMPORTANT] +> Ultimately the security of a solution based on Triton +> is the responsibility of the developer building and deploying that +> solution. When deploying in production settings please have security +> experts review any potential risks and threats. + +> [!WARNING] +> Dynamic updates to model repositories are disabled by +> default. Enabling dynamic updates to model repositories either +> through model loading APIs or through directory polling can lead to +> arbitrary code execution. Model repository access control is +> critical in production deployments. If dynamic updates are required, +> ensure only trusted entities have access to model loading APIs and +> model repository directories. + +## Deploying Behind a Secure Proxy or Gateway + +The Triton Inference Server is designed primarily as a microservice to +be deployed as part of a larger solution within an application +framework or service mesh. + +In such deployments it is typical to utilize dedicated gateway or +proxy servers to handle authorization, access control, resource +management, encryption, load balancing, redundancy and many other +security and availability features. + +The full design of such systems is outside the scope of this +deployment guide but in such scenarios dedicated ingress controllers +handle access from outside the trusted network while Triton Inference +Server handles only trusted, validated requests. + +In such scenarios Triton Inference Server is not exposed directly to +an untrusted network. + +### References on Secure Deployments + +In the following references, Triton Inference Server would be deployed +as an "Application" or "Service" within the trusted internal network. 
+ +* [https://www.nginx.com/blog/architecting-zero-trust-security-for-kubernetes-apps-with-nginx/] +* [https://istio.io/latest/docs/concepts/security/] +* [https://konghq.com/blog/enterprise/envoy-service-mesh] +* [https://www.solo.io/topics/envoy-proxy/] + +## Running with Least Privilege + + The security principle of least privilege advocates that a process be + granted the minimum permissions required to do its job. + + For an inference solution based on Triton Inference Server there are a + number of ways to reduce security risks by limiting the permissions + and capabilities of the server to the minimum required for correct + operation. + +### 1. Follow Best Practices for Securing Kubernetes Deployments + + When deploying Triton within a Kubernetes pod ensure that it is + running with a service account with the fewest possible + permissions. Ensure that you have configured [role based access + control](https://kubernetes.io/docs/reference/access-authn-authz/rbac/) + to limit access to resources and capabilities as required by your + application. + +### 2. Follow Best Practices for Launching Standalone Docker Containers + + When Triton is deployed as a containerized service, standard docker + security practices apply. This includes limiting the resources that a + container has access to as well as limiting network access to the + container. https://docs.docker.com/engine/security/ + +### 3. Run as a Non-Root User + + Triton's pre-built containers contain a non-root user that can be used + to launch the tritonserver application with limited permissions. This + user, `triton-server`, is created with `user id 1000`. When launching + the container using docker the user can be set with the `--user` + command line option. + +##### Example Launch Command + + ``` + docker run --rm --user triton-server -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:YY.MM-py3 tritonserver --model-repository=/models + ``` + +### 4. Restrict or Disable Access to Protocols and APIs + +The pre-built Triton Inference Server application enables a full set +of features including health checks, server metadata, inference APIs, +shared memory APIs, model and model repository configuration, +statistics, tracing and logging. Care should be taken to only expose +those capabilities that are required for your solution. + +#### Disabling Features at Compile Time + +When building a custom inference server application, features can be +selectively enabled or disabled using the `build.py` script. As an +example, a developer can use the flags `--endpoint http` and +`--endpoint grpc` to compile support for `http`, `grpc` or +both. Support for individual backends can be enabled as well. For more +details please see [documentation](build.md) on building a custom +inference server application. + +#### Disabling / Restricting Features at Run Time + +The `tritonserver` application provides a number of command line +options to enable and disable features when launched. For a full list +of options please see `tritonserver --help`. The following subset is +described here with basic recommendations. + +##### `--exit-on-error <boolean>, default True` + +Exits the inference server if any error occurs during +initialization. Recommended to set to `True` to catch any +unanticipated errors. + +##### `--disable-auto-complete-config, default enabled` + +Disables backends from autocompleting model configuration. If not +required for your solution, it is recommended to disable this to ensure model +configurations are defined statically.
+ +##### `--strict-readiness <boolean>, default True` + +If set to true, `/v2/health/ready` will only report ready when all +selected models are loaded. Recommended to set to `True` to provide a +signal to other services and orchestration frameworks when full +initialization is complete and the server is healthy. + +##### `--model-control-mode <string>, default "none"` + +Specifies the mode for model management. + +> [!WARNING] +> Allowing dynamic updates to the model repository can lead +> to arbitrary code execution. Model repository access control is +> critical in production deployments. Unless required for operation, it's recommended +> to disable dynamic updates. If required, please ensure only trusted entities +> can add or remove models from a model repository. + +Options: + + * `none` - Models are loaded at startup and cannot be modified. + * `poll` - The server process will poll the model repository for changes. + * `explicit` - Models can be loaded and unloaded via the model control APIs. + +Recommended to set to `none` unless dynamic updates are required. If +dynamic updates are required, care must be taken to control access to +the model repository files and load and unload APIs. + +##### `--allow-http <boolean>, default True` + +Enable HTTP request handling. Recommended to set to `False` if not required. + +##### `--allow-grpc <boolean>, default True` + +Enable gRPC request handling. Recommended to set to `False` if not required. + +##### `--grpc-use-ssl <boolean> default False` + +Use SSL authentication for gRPC requests. Recommended to set to `True` if service is not protected by a gateway or proxy. + +##### `--grpc-use-ssl-mutual <boolean> default False` + +Use mutual SSL authentication for gRPC requests. Recommended to set to `True` if service is not protected by a gateway or proxy. + +##### `--grpc-restricted-protocol <protocol>:<key>=<value>` + +Restrict access to specific gRPC protocol categories to users with +specific key, value pair shared secret. See +[limit-endpoint-access](inference_protocols.md#limit-endpoint-access-beta) +for more information. + +> [!Note] +> Restricting access can be used to limit exposure to model +> control APIs to trusted users. + +##### `--http-restricted-api <API>:<key>=<value>` + +Restrict access to specific HTTP API categories to users with +specific key, value pair shared secret. See +[limit-endpoint-access](inference_protocols.md#limit-endpoint-access-beta) +for more information. + +> [!Note] +> Restricting access can be used to limit exposure to model +> control APIs to trusted users. + +##### `--allow-sagemaker <boolean> default False` + +Enable Sagemaker request handling. Recommended to set to `False` unless required. + +##### `--allow-vertex-ai <boolean> default depends on environment variable` + +Enable Vertex AI request handling. Default is `True` if +`AIP_MODE=PREDICTION`, `False` otherwise. Recommended to set to +`False` unless required. + +##### `--allow-metrics <boolean> default True` + +Allow the server to publish Prometheus-style metrics. Recommended to set +to `False` if not required to avoid capturing or exposing any sensitive information. + +#### `--trace-config level=<string> default "off"` + +Tracing mode. Trace mode supports `triton` and `opentelemetry`. Unless required `--trace-config level=off` should be set to avoid capturing or exposing any sensitive information. + + +##### `backend-directory default /opt/tritonserver/backends` + +Directory where backend shared libraries are found. + +> [!Warning] +> Access to add or remove files from the backend directory +> must be access controlled. Adding untrusted files +> can lead to arbitrary code execution.
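As a rough illustration of how the run-time options above can be combined, the following is a minimal sketch of a locked-down launch. The flag values and the /models path are placeholders rather than a definitive configuration; keep only the settings that apply to your deployment (for example, re-enable HTTP or metrics if you rely on them).

```
# Illustrative sketch only: a conservative launch built from the flags
# discussed above. Values and paths are placeholders, not a recommendation
# for every deployment.
tritonserver --model-repository=/models \
             --exit-on-error=true \
             --strict-readiness=true \
             --disable-auto-complete-config \
             --model-control-mode=none \
             --allow-http=false \
             --allow-grpc=true \
             --allow-metrics=false \
             --trace-config level=off
```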
+
+##### `--repoagent-directory, default /opt/tritonserver/repoagents`
+
+Directory where repository agent shared libraries are found.
+
+> [!Warning]
+> Access to add or remove files from the repoagent directory
+> must be access controlled. Adding untrusted files
+> can lead to arbitrary code execution.
+
+##### `--cache-directory, default /opt/tritonserver/caches`
+
+Directory where cache shared libraries are found.
+
+> [!Warning]
+> Access to add or remove files from the cache directory
+> must be access controlled. Adding untrusted files
+> can lead to arbitrary code execution.
+
+
diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md
new file mode 100644
index 0000000000..a241f097da
--- /dev/null
+++ b/docs/customization_guide/inference_protocols.md
@@ -0,0 +1,516 @@
+
+
+# Inference Protocols and APIs
+
+Clients can communicate with Triton using either an [HTTP/REST
+protocol](#httprest-and-grpc-protocols), a [GRPC
+protocol](#httprest-and-grpc-protocols), or by an [in-process C
+API](#in-process-triton-server-api) or its
+[C++ wrapper](https://github.com/triton-inference-server/developer_tools/tree/main/server).
+
+## HTTP/REST and GRPC Protocols
+
+Triton exposes both HTTP/REST and GRPC endpoints based on [standard
+inference
+protocols](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2)
+that have been proposed by the [KServe
+project](https://github.com/kserve). To fully enable all capabilities
+Triton also implements [HTTP/REST and GRPC
+extensions](https://github.com/triton-inference-server/server/tree/main/docs/protocol)
+to the KServe inference protocol. The GRPC protocol also provides a
+bi-directional streaming version of the inference RPC to allow a
+sequence of inference requests/responses to be sent over a
+GRPC stream. We typically recommend using the unary version for
+inference requests. The streaming version should be used only if the
+situation demands it. Some such use cases are:
+
+* Assume a system with multiple Triton server instances running
+  behind a Load Balancer. If a sequence of inference requests is
+  needed to hit the same Triton server instance, a GRPC stream
+  will hold a single connection throughout the lifetime and hence
+  ensure the requests are delivered to the same Triton instance.
+* If the order of requests/responses needs to be preserved over
+  the network, a GRPC stream will ensure that the server receives
+  the requests in the same order as they were sent from the
+  client.
+
+The HTTP/REST and GRPC protocols also provide endpoints to check
+server and model health, metadata and statistics. Additional
+endpoints allow model loading and unloading, and inferencing. See
+the KServe and extension documentation for details.
+
+### HTTP Options
+Triton provides the following configuration options for server-client network transactions over the HTTP protocol.
+
+#### Compression
+
+Triton allows the on-wire compression of request/response on HTTP through its clients. See [HTTP Compression](https://github.com/triton-inference-server/client/tree/main#compression) for more details.
+
+### GRPC Options
+Triton exposes various GRPC parameters for configuring the server-client network transactions. For usage of these options, refer to the output from `tritonserver --help`.
+
+#### SSL/TLS
+
+These options can be used to configure a secured channel for communication.
The server-side options include:
+
+* `--grpc-use-ssl`
+* `--grpc-use-ssl-mutual`
+* `--grpc-server-cert`
+* `--grpc-server-key`
+* `--grpc-root-cert`
+
+For client-side documentation, see [Client-Side GRPC SSL/TLS](https://github.com/triton-inference-server/client/tree/main#ssltls).
+
+For an overview of authentication in gRPC, refer to the [gRPC authentication guide](https://grpc.io/docs/guides/auth/).
+
+#### Compression
+
+Triton allows the on-wire compression of request/response messages by exposing the following option on the server side:
+
+* `--grpc-infer-response-compression-level`
+
+For client-side documentation, see [Client-Side GRPC Compression](https://github.com/triton-inference-server/client/tree/main#compression-1).
+
+Compression can be used to reduce the amount of bandwidth used in server-client communication. For more details, see [gRPC Compression](https://grpc.github.io/grpc/core/md_doc_compression.html).
+
+#### GRPC KeepAlive
+
+Triton exposes GRPC KeepAlive parameters with the default values for both
+client and server described [here](https://github.com/grpc/grpc/blob/master/doc/keepalive.md).
+
+These options can be used to configure the KeepAlive settings:
+
+* `--grpc-keepalive-time`
+* `--grpc-keepalive-timeout`
+* `--grpc-keepalive-permit-without-calls`
+* `--grpc-http2-max-pings-without-data`
+* `--grpc-http2-min-recv-ping-interval-without-data`
+* `--grpc-http2-max-ping-strikes`
+
+For client-side documentation, see [Client-Side GRPC KeepAlive](https://github.com/triton-inference-server/client/blob/main/README.md#grpc-keepalive).
+
+#### GRPC Status Codes
+
+Triton implements GRPC error handling for streaming requests when a specific flag is enabled through headers. Upon encountering an error, Triton returns the appropriate GRPC error code and subsequently closes the stream.
+
+* `triton_grpc_error` : The header value needs to be set to true while starting the stream.
+
+GRPC status codes can be used for better visibility and monitoring. For more details, see [gRPC Status Codes](https://grpc.io/docs/guides/status-codes/).
+
+For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes).
+
+### Limit Endpoint Access (BETA)
+
+Triton users may want to restrict access to protocols or APIs that are
+provided by the GRPC or HTTP endpoints of a server. For example, users
+can provide one set of access credentials for inference APIs and
+another for model control APIs such as model loading and unloading.
+
+The following options can be specified to declare a restricted
+protocol group (GRPC) or restricted API group (HTTP):
+
+```
+--grpc-restricted-protocol=<protocol_1>,<protocol_2>,...:<restricted-key>=<restricted-value>
+--http-restricted-api=<API_1>,<API_2>,...:<restricted-key>=<restricted-value>
+```
+
+The option can be specified multiple times to specify multiple groups of
+protocols or APIs with different restriction settings.
+
+* `protocols / APIs` : A comma-separated list of protocols / APIs to be included in this
+group. Note that currently a given protocol / API is not allowed to be included in
+multiple groups. The following protocols / APIs are recognized:
+
+  * `health` : Health endpoint defined for [HTTP/REST](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#health) and [GRPC](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#health-1). For the GRPC endpoint, this value also exposes the [GRPC health check protocol](https://github.com/triton-inference-server/common/blob/main/protobuf/health.proto).
+  * `metadata` : Server / model metadata endpoints defined for [HTTP/REST](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#server-metadata) and [GRPC](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#server-metadata-1).
+  * `inference` : Inference endpoints defined for [HTTP/REST](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#inference) and [GRPC](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#inference-1).
+  * `shared-memory` : [Shared-memory endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md).
+  * `model-config` : [Model configuration endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_configuration.md).
+  * `model-repository` : [Model repository endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md).
+  * `statistics` : [statistics endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md).
+  * `trace` : [trace endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_trace.md).
+  * `logging` : [logging endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_logging.md).
+
+* `restricted-key` : The GRPC / HTTP request header
+to be checked when a request is received. The
+completed header for GRPC will be in the form of
+`triton-grpc-protocol-<restricted-key>`. The completed header for HTTP
+will be in the form of `<restricted-key>`.
+
+* `restricted-value` : The header value required to access the specified protocols.
+
+#### Example
+
+To start the server with a set of protocols and APIs restricted for
+`admin` usage and the rest of the protocols and APIs left unrestricted,
+use the following command line arguments:
+
+```
+tritonserver --grpc-restricted-protocol=shared-memory,model-config,model-repository,statistics,trace:<admin-key>=<admin-value> \
+    --http-restricted-api=shared-memory,model-config,model-repository,statistics,trace:<admin-key>=<admin-value> ...
+```
+
+GRPC requests to `admin` protocols require that an additional header
+`triton-grpc-protocol-<admin-key>` is provided with value
+`<admin-value>`. HTTP requests to `admin` APIs require that an
+additional header `<admin-key>` is provided with value `<admin-value>`.
+
+
+## In-Process Triton Server API
+
+The Triton Inference Server provides a backwards-compatible C API that
+allows Triton to be linked directly into a C/C++ application. This API
+is called the "Triton Server API" or just "Server API" for short. The
+API is implemented in the Triton shared library which is built from
+source contained in the [core
+repository](https://github.com/triton-inference-server/core). On Linux
+this library is libtritonserver.so and on Windows it is
+tritonserver.dll. In the Triton Docker image the shared library is
+found in /opt/tritonserver/lib. The header file that defines and
+documents the Server API is
+[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h).
+[Java bindings for In-Process Triton Server API](#java-bindings-for-in-process-triton-server-api)
+are built on top of `tritonserver.h` and can be used for Java applications that
+need to use Tritonserver in-process.
+
+All capabilities of Triton server are encapsulated in the shared
+library and are exposed via the Server API.
The `tritonserver` +executable implements HTTP/REST and GRPC endpoints and uses the Server +API to communicate with core Triton logic. The primary source files +for the endpoints are [grpc_server.cc](https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc) and +[http_server.cc](https://github.com/triton-inference-server/server/blob/main/src/http_server.cc). In these source files you can +see the Server API being used. + +You can use the Server API in your own application as well. A simple +example using the Server API can be found in +[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). + +### API Description + +Triton server functionality is encapsulated in a shared library which +is built from source contained in the [core +repository](https://github.com/triton-inference-server/core). You can +include the full capabilities of Triton by linking the shared library +into your application and by using the C API defined in +[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). + +When you link the Triton shared library into your application you are +*not* spawning a separate Triton process, instead, you are including +the Triton core logic directly in your application. The Triton +HTTP/REST or GRPC protocols are not used to communicate with this +Triton core logic, instead all communication between your application +and the Triton core logic must take place via the [Server +API](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). + +The top-level abstraction used by Server API is `TRITONSERVER_Server`, +which represents the Triton core logic that is capable of implementing +all of the features and capabilities of Triton. A +`TRITONSERVER_Server` object is created by calling +`TRITONSERVER_ServerNew` with a set of options that indicate how the +object should be initialized. Use of `TRITONSERVER_ServerNew` is +demonstrated in [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). Once you have created a +`TRITONSERVER_Server` object, you can begin using the rest of the +Server API as described below. + +#### Error Handling + +Most Server API functions return an error object indicating success or +failure. Success is indicated by return `nullptr` (`NULL`). Failure is +indicated by returning a `TRITONSERVER_Error` object. The error code +and message can be retrieved from a `TRITONSERVER_Error` object with +`TRITONSERVER_ErrorCode` and `TRITONSERVER_ErrorMessage`. + +The lifecycle and ownership of all Server API objects is documented in +[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). For +`TRITONSERVER_Error`, ownership of the object passes to the caller of +the Server API function. As a result, your application is responsible +for managing the lifecycle of the returned `TRITONSERVER_Error` +object. You must delete the error object using +`TRITONSERVER_ErrorDelete` when you are done using it. Macros such as +`FAIL_IF_ERR` shown in [common.h](https://github.com/triton-inference-server/server/blob/main/src/common.h) are useful for +managing error object lifetimes. 
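+
+For illustration, the following is a minimal sketch of this ownership
+pattern. The `LogIfError` helper and the model repository path are
+illustrative placeholders, not part of the Server API, and the include
+path may differ depending on how you build against the core library.
+
+```
+#include <iostream>
+#include <string>
+
+#include "tritonserver.h"
+
+// Log a TRITONSERVER_Error (if any) and release it. Ownership of the
+// error object passes to the caller, so it must be deleted here.
+bool
+LogIfError(TRITONSERVER_Error* err, const std::string& context)
+{
+  if (err == nullptr) {
+    return false;  // success
+  }
+  std::cerr << context << ": " << TRITONSERVER_ErrorCodeString(err) << " - "
+            << TRITONSERVER_ErrorMessage(err) << std::endl;
+  TRITONSERVER_ErrorDelete(err);
+  return true;
+}
+
+int
+main()
+{
+  // Build the options used to initialize the server object.
+  TRITONSERVER_ServerOptions* options = nullptr;
+  if (LogIfError(TRITONSERVER_ServerOptionsNew(&options), "creating options")) {
+    return 1;
+  }
+  if (LogIfError(
+          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
+              options, "/path/to/model_repository"),
+          "setting model repository")) {
+    return 1;
+  }
+
+  // Create the server; on failure a TRITONSERVER_Error is returned.
+  TRITONSERVER_Server* server = nullptr;
+  if (LogIfError(TRITONSERVER_ServerNew(&server, options), "creating server")) {
+    return 1;
+  }
+
+  LogIfError(TRITONSERVER_ServerOptionsDelete(options), "deleting options");
+  LogIfError(TRITONSERVER_ServerDelete(server), "deleting server");
+  return 0;
+}
+```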
+ +#### Versioning and Backwards Compatibility + +A typical pattern, demonstrated in [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) and +shown below, shows how you can compare the Server API version provided +by the shared library against the Server API version that you compiled +your application against. The Server API is backwards compatible, so +as long as the major version provided by the shared library matches +the major version that you compiled against, and the minor version +provided by the shared library is greater-than-or-equal to the minor +version that you compiled against, then your application can use the +Server API. + +``` +#include "tritonserver.h" +// Error checking removed for clarity... +uint32_t api_version_major, api_version_minor; +TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor); +if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major) || + (TRITONSERVER_API_VERSION_MINOR > api_version_minor)) { + // Error, the shared library implementing the Server API is older than + // the version of the Server API that you compiled against. +} +``` + +#### Non-Inference APIs + +The Server API contains functions for checking health and readiness, +getting model information, getting model statistics and metrics, +loading and unloading models, etc. The use of these functions is +straightforward and some of these functions are demonstrated in +[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) and all are documented in +[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). + +#### Inference APIs + +Performing an inference request requires the use of many Server API +functions and objects, as demonstrated in +[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). The general usage requires the +following steps. + +* Create a `TRITONSERVER_ResponseAllocator` using + `TRITONSERVER_ResponseAllocatorNew`. You can use the same response + allocator for all of your inference requests, or you can create + multiple response allocators. When Triton produces an output + tensor, it needs a memory buffer into which it can store the + contents of that tensor. Triton defers the allocation of these + output buffers by invoking callback functions in your + application. You communicate these callback functions to Triton with + the `TRITONSERVER_ResponseAllocator` object. You must implement two + callback functions, one for buffer allocation and one for buffer + free. The signatures for these functions are + `TRITONSERVER_ResponseAllocatorAllocFn_t` and + `TRITONSERVER_ResponseAllocatorReleaseFn_t` as defined in + [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). In + [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc), these callback functions are + implemented as `ResponseAlloc` and `ResponseRelease`. + +* Create an inference request as a `TRITONSERVER_InferenceRequest` + object. The inference request is where you specify what model you + want to use, the input tensors and their values, the output tensors + that you want returned, and other request parameters. You create an + inference request using `TRITONSERVER_InferenceRequestNew`. 
You + create each input tensor in the request using + `TRITONSERVER_InferenceRequestAddInput` and set the data for the + input tensor using `TRITONSERVER_InferenceRequestAppendInputData` + (or one of the `TRITONSERVER_InferenceRequestAppendInputData*` + variants defined in + [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h)). By + default, Triton will return all output tensors, but you can limit + Triton to only return some outputs by using + `TRITONSERVER_InferenceRequestAddRequestedOutput`. + + To correctly manage the lifecycle of the inference request, you must + use `TRITONSERVER_InferenceRequestSetReleaseCallback` to set a + callback into a function in your application. This callback will be + invoke by Triton to return ownership of the + `TRITONSERVER_InferenceRequest` object. Typically, in this callback + you will just delete the `TRITONSERVER_InferenceRequest` object by + using `TRITONSERVER_InferenceRequestDelete`. But you may also + implement a different lifecycle management; for example, if you are + reusing inference request objects you would want to make the object + available for reuse. + + You can optionally use `TRITONSERVER_InferenceRequestSetId` to set a + user-defined ID on the request. This ID is not used by Triton but + will be returned in the response. + + You can reuse an existing `TRITONSERVER_InferenceRequest` object for + a new inference request. A couple of examples of how this is done + and why it is useful are shown in [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). + +* Ask Triton to execute the inference request using + `TRITONSERVER_ServerInferAsync`. `TRITONSERVER_ServerInferAsync` is + a asynchronous call that returns immediately. The inference response + is returned via a callback into your application. You register this + callback using `TRITONSERVER_InferenceRequestSetResponseCallback` + before you invoke `TRITONSERVER_ServerInferAsync`. In + [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) this callback is + `InferResponseComplete`. + + When you invoke `TRITONSERVER_ServerInferAsync` and it returns + without error, you are passing ownership of the + `TRITONSERVER_InferenceRequest` object to Triton, and so you must + not access that object in any way until Triton returns ownership to + you via the callback you registered with + `TRITONSERVER_InferenceRequestSetReleaseCallback`. + +* Process the inference response. The inference response is returned + to the callback function you registered with + `TRITONSERVER_InferenceRequestSetResponseCallback`. Your callback + receives the response as a `TRITONSERVER_InferenceResponse` + object. Your callback takes ownership of the + `TRITONSERVER_InferenceResponse` object and so must free it with + `TRITONSERVER_InferenceResponseDelete` when it is no longer needed. + + The first step in processing a response is to use + `TRITONSERVER_InferenceResponseError` to check if the response is + returning an error or if it is returning valid results. If the + response is valid you can use + `TRITONSERVER_InferenceResponseOutputCount` to iterate over the + output tensors, and `TRITONSERVER_InferenceResponseOutput` to get + information about each output tensor. 
+
+  Note that the [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) example uses a
+  std::promise to simply wait for the response, but synchronizing
+  response handling in this way is not required. You can have multiple
+  inference requests in flight at the same time and can issue
+  inference requests from the same thread or from multiple different
+  threads.
+
+As noted above, the Server API allows Triton to be linked directly
+into a C/C++ application. The API is documented in
+[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h).
+A simple example using the C API can be found in
+[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). A more complicated example can be
+found in the source that implements the HTTP/REST and GRPC endpoints
+for Triton. These endpoints use the C API to communicate with the core
+of Triton. The primary source files for the endpoints are
+[grpc_server.cc](https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc) and
+[http_server.cc](https://github.com/triton-inference-server/server/blob/main/src/http_server.cc).
+
+## Java bindings for In-Process Triton Server API
+
+The Triton Inference Server uses [Java CPP](https://github.com/bytedeco/javacpp)
+to create Java bindings around the in-process Tritonserver C API.
+
+The API is documented in
+[tritonserver.java](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/src/gen/java/org/bytedeco/tritonserver/global/tritonserver.java).
+Alternatively, the user can refer to the web version [API docs](http://bytedeco.org/javacpp-presets/tritonserver/apidocs/)
+generated from `tritonserver.java`.
+**Note:** Currently, `tritonserver.java` contains bindings for both the `In-process C-API`
+and the bindings for `C-API Wrapper`. More information about the [developer_tools/server C-API wrapper](https://github.com/triton-inference-server/developer_tools/blob/main/server/README.md) can be found in the [developer_tools repository](https://github.com/triton-inference-server/developer_tools/).
+
+A simple example using the Java API can be found in the
+[Samples folder](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver/samples),
+which includes `Simple.java`, which is similar to
+[`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc).
+Please refer to the
+[sample usage documentation](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver#sample-usage)
+to learn how to build and run `Simple.java`.
+
+In the [QA folder](https://github.com/triton-inference-server/server/blob/main/qa), folders starting with L0_java include Java API tests.
+These can be useful references for getting started, such as the
+[ResNet50 test](https://github.com/triton-inference-server/server/blob/main/qa/L0_java_resnet).
+
+### Java API setup instructions
+
+To use the Tritonserver Java API, you will need to have the Tritonserver library
+and dependencies installed in your environment. There are two ways to do this:
+
+1. Use a Tritonserver docker container with
+    1. `.jar` Java bindings to C API (recommended)
+    2. maven and build bindings yourself
+2. Build Triton from your environment without Docker (not recommended)
+
+#### Run Tritonserver container and install dependencies
+
+To set up your environment with the Triton Java API, please follow these steps:
+1.
First run Docker container: +``` + $ docker run -it --gpus=all -v ${pwd}:/workspace nvcr.io/nvidia/tritonserver:-py3 bash +``` +2. Install `jdk`: +```bash + $ apt update && apt install -y openjdk-11-jdk +``` +3. Install `maven` (only if you want to build the bindings yourself): +```bash +$ cd /opt/tritonserver + $ wget https://archive.apache.org/dist/maven/maven-3/3.8.4/binaries/apache-maven-3.8.4-bin.tar.gz + $ tar zxvf apache-maven-3.8.4-bin.tar.gz + $ export PATH=/opt/tritonserver/apache-maven-3.8.4/bin:$PATH +``` + +#### Run Java program with Java bindings Jar + +After ensuring that Tritonserver and dependencies are installed, you can run your +Java program with the Java bindings with the following steps: + +1. Place Java bindings into your environment. You can do this by either: + + a. Building Java API bindings with provided build script: + ```bash + # Clone Triton client repo. Recommended client repo tag is: main + $ git clone --single-branch --depth=1 -b + https://github.com/triton-inference-server/client.git clientrepo + # Run build script + ## For In-Process C-API Java Bindings + $ source clientrepo/src/java-api-bindings/scripts/install_dependencies_and_build.sh + ## For C-API Wrapper (Triton with C++ bindings) Java Bindings + $ source clientrepo/src/java-api-bindings/scripts/install_dependencies_and_build.sh --enable-developer-tools-server + ``` + This will install the Java bindings to `/workspace/install/java-api-bindings/tritonserver-java-bindings.jar` + + *or* + + b. Copying "Uber Jar" from Triton SDK container to your environment + ```bash + $ id=$(docker run -dit nvcr.io/nvidia/tritonserver:-py3-sdk bash) + $ docker cp ${id}:/workspace/install/java-api-bindings/tritonserver-java-bindings.jar /tritonserver-java-bindings.jar + $ docker stop ${id} + ``` + **Note:** `tritonserver-java-bindings.jar` only includes the `In-Process Java Bindings`. To use the `C-API Wrapper Java Bindings`, please use the build script. +2. Use the built "Uber Jar" that contains the Java bindings + ```bash + $ java -cp /tritonserver-java-bindings.jar + ``` + +#### Build Java bindings and run Java program with Maven + +If you want to make changes to the Java bindings, then you can use Maven to +build yourself. You can refer to part 1.a of [Run Java program with Java +bindings Jar](#run-java-program-with-java-bindings-jar) to also build the jar +yourself without any modifications to the Tritonserver bindings in +JavaCPP-presets. +You can do this using the following steps: + +1. Create the JNI binaries in your local repository (`/root/.m2/repository`) + with [`javacpp-presets/tritonserver`](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver). + For C-API Wrapper Java bindings (Triton with C++ bindings), you need to + install some build specific dependencies including cmake and rapidjson. + Refer to [java installation script](https://github.com/triton-inference-server/client/blob/main/src/java-api-bindings/scripts/install_dependencies_and_build.sh) + for dependencies you need to install and modifications you need to make for your container. +After installing dependencies, you can build the tritonserver project on javacpp-presets: +```bash + $ git clone https://github.com/bytedeco/javacpp-presets.git + $ cd javacpp-presets + $ mvn clean install --projects .,tritonserver + $ mvn clean install -f platform --projects ../tritonserver/platform -Djavacpp.platform=linux-x86_64 +``` +2. Create your custom `*.pom` file for Maven. 
Please refer to
+  [samples/simple/pom.xml](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/samples/simple/pom.xml) as a
+  reference for how to create your pom file.
+3. After creating your `pom.xml` file you can build your application with:
+```bash
+  $ mvn compile exec:java -Djavacpp.platform=linux-x86_64 -Dexec.args=""
+```
diff --git a/docs/customization_guide/repository_agents.md b/docs/customization_guide/repository_agents.md
new file mode 100644
index 0000000000..02fb1d57ec
--- /dev/null
+++ b/docs/customization_guide/repository_agents.md
@@ -0,0 +1,176 @@
+
+
+# Repository Agent
+
+A *repository agent* extends Triton with new functionality that
+operates when a model is loaded or unloaded. You can introduce your
+own code to perform authentication, decryption, conversion, or similar
+operations when a model is loaded.
+
+**BETA: The repository agent API is beta quality and is subject to
+non-backward-compatible changes for one or more releases.**
+
+A repository agent communicates with Triton using the [repository agent
+API](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonrepoagent.h). The
+[checksum_repository_agent GitHub
+repo](https://github.com/triton-inference-server/checksum_repository_agent)
+provides an example repository agent that verifies file checksums
+before loading a model.
+
+## Using a Repository Agent
+
+A model can use one or more repository agents by specifying them in
+the *ModelRepositoryAgents* section of the [model
+configuration](../user_guide/model_configuration.md). Each repository agent can have
+parameters specific to that agent that are specified in the model
+configuration to control the behavior of the agent. To understand the
+parameters available for a given agent consult the documentation for
+that agent.
+
+Multiple agents may be specified for the same model and they will be
+invoked in order when a model is loaded or unloaded. The following
+example model configuration shows how two agents, "agent0"
+and "agent1", are specified so that they are invoked in that order
+with the given parameters.
+
+```
+model_repository_agents
+{
+  agents [
+    {
+      name: "agent0",
+      parameters [
+        {
+          key: "key0",
+          value: "value0"
+        },
+        {
+          key: "key1",
+          value: "value1"
+        }
+      ]
+    },
+    {
+      name: "agent1",
+      parameters [
+        {
+          key: "keyx",
+          value: "valuex"
+        }
+      ]
+    }
+  ]
+}
+```
+
+## Implementing a Repository Agent
+
+A repository agent must be implemented as a shared library and the
+name of the shared library must be
+*libtritonrepoagent_\<repo-agent-name\>.so*. The shared library should
+hide all symbols except those needed by the repository agent API. See
+the [checksum example's
+CMakeList.txt](https://github.com/triton-inference-server/checksum_repository_agent/blob/main/CMakeLists.txt)
+for an example of how to use an ldscript to expose only the necessary
+symbols.
+
+The shared library will be dynamically loaded by Triton when it is
+needed. For a repository agent called *A*, the shared library must be
+installed as \<repoagent_directory\>/A/libtritonrepoagent_A.so.
+Where \<repoagent_directory\> is by default
+/opt/tritonserver/repoagents. The --repoagent-directory flag can be
+used to override the default.
+
+Your repository agent must implement the repository agent API as
+documented in
+[tritonrepoagent.h](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonrepoagent.h).
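+
+As an illustrative sketch (not the checksum example itself), the core
+of an agent is the `TRITONREPOAGENT_ModelAction` entry point declared
+in tritonrepoagent.h. The logging below is arbitrary; a real agent
+would perform its authentication, decryption, or validation work when
+it receives the load action.
+
+```
+#include <iostream>
+
+#include "triton/core/tritonrepoagent.h"
+
+extern "C" {
+
+// Called by Triton for each lifecycle action on a model that lists
+// this agent in its ModelRepositoryAgents configuration. Returning
+// nullptr indicates success and that the original repository should
+// be used unchanged.
+TRITONSERVER_Error*
+TRITONREPOAGENT_ModelAction(
+    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
+    const TRITONREPOAGENT_ActionType action_type)
+{
+  switch (action_type) {
+    case TRITONREPOAGENT_ACTION_LOAD: {
+      // Inspect the model's repository location before Triton loads it.
+      TRITONREPOAGENT_ArtifactType artifact_type;
+      const char* location = nullptr;
+      TRITONSERVER_Error* err = TRITONREPOAGENT_ModelRepositoryLocation(
+          agent, model, &artifact_type, &location);
+      if (err != nullptr) {
+        return err;  // returning an error causes the model load to fail
+      }
+      std::cout << "repoagent: loading model from " << location << std::endl;
+      break;
+    }
+    case TRITONREPOAGENT_ACTION_UNLOAD:
+      std::cout << "repoagent: unloading model" << std::endl;
+      break;
+    default:
+      // LOAD_COMPLETE, LOAD_FAIL and UNLOAD_COMPLETE are ignored here.
+      break;
+  }
+  return nullptr;  // success
+}
+
+}  // extern "C"
+```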
+ +Triton follows these steps when loading a model: + +* Load the model's configuration file (config.pbtxt) and extract the + *ModelRepositoryAgents* settings. Even if a repository agent + modifies the config.pbtxt file, the repository agent settings from + the initial config.pbtxt file are used for the entire loading + process. + +* For each repository agent specified: + + * Initialize the corresponding repository agent, loading the shared + library if necessary. Model loading fails if the shared library is + not available or if initialization fails. + + * Invoke the repository agent's *TRITONREPOAGENT_ModelAction* + function with action TRITONREPOAGENT_ACTION_LOAD. As input the + agent can access the model's repository as either a cloud storage + location or a local filesystem location. + + * The repository agent can return *success* to indicate that no + changes where made to the repository, can return *failure* to + indicate that the model load should fail, or can create a new + repository for the model (for example, by decrypting the input + repository) and return *success* to indicate that the new + repository should be used. + + * If the agent returns *success* Triton continues to the next + agent. If the agent returns *failure*, Triton skips invocation of + any additional agents. + +* If all agents returned *success*, Triton attempts to load the model + using the final model repository. + +* For each repository agent that was invoked with + TRITONREPOAGENT_ACTION_LOAD, in reverse order: + + * Triton invokes the repository agent's + *TRITONREPOAGENT_ModelAction* function with action + TRITONREPOAGENT_ACTION_LOAD_COMPLETE if the model loaded + successfully or TRITONREPOAGENT_ACTION_LOAD_FAIL if the model + failed to load. + +Triton follows these steps when unloading a model: + +* Triton uses the repository agent settings from the initial + config.pbtxt file, even if during loading one or more agents + modified its contents. + +* For each repository agent that was invoked with + TRITONREPOAGENT_ACTION_LOAD, in the same order: + + * Triton invokes the repository agent's + *TRITONREPOAGENT_ModelAction* function with action + TRITONREPOAGENT_ACTION_UNLOAD. + +* Triton unloads the model. + +* For each repository agent that was invoked with + TRITONREPOAGENT_ACTION_UNLOAD, in reverse order: + + * Triton invokes the repository agent's + *TRITONREPOAGENT_ModelAction* function with action + TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE. diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md new file mode 100644 index 0000000000..8487e6e3ad --- /dev/null +++ b/docs/customization_guide/test.md @@ -0,0 +1,137 @@ + + +# Testing Triton + +Currently there is no CI testing enabled for Triton repositories. We +will enable CI testing in a future update. + +However, there is a set of tests in the qa/ directory that can be run +manually to provide extensive testing. Before running these tests you +must first generate a few model repositories containing the models +needed by the tests. + +## Generate QA Model Repositories + +The QA model repositories contain some simple models that are used to +verify the correctness of Triton. To generate the QA model +repositories: + +``` +$ cd qa/common +$ ./gen_qa_model_repository +$ ./gen_qa_custom_ops +``` + +This will create multiple model repositories in /tmp/\/qa_* +(for example /tmp/24.09/qa_model_repository). The TensorRT models +will be created for the GPU on the system that CUDA considers device 0 +(zero). 
If you have multiple GPUs on your system see the documentation +in the scripts for how to target a specific GPU. + +## Build SDK Image + +Build the *tritonserver_sdk* image that contains the client +libraries, model analyzer, perf analyzer and examples using the following +commands. You must first checkout the `` branch of the +*client* repo into the clientrepo/ subdirectory and the `` +branch of the *perf_analyzer* repo into the perfanalyzerrepo/ subdirectory +respectively. Typically you want to set both `` and `` +to be the same as your current server branch. + +``` +$ cd +$ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/client.git clientrepo +$ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/perf_analyzer.git perfanalyzerrepo +$ docker build -t tritonserver_sdk -f Dockerfile.sdk . +``` + +## Build QA Image + +Next you need to build a QA version of the Triton Docker image. This +image will contain Triton, the QA tests, and all the dependencies +needed to run the QA tests. First do a [Docker image +build](build.md#building-with-docker) to produce the +*tritonserver_cibase* and *tritonserver* images. + +Then, build the actual QA image. + +``` +$ docker build -t tritonserver_qa -f Dockerfile.QA . +``` + +## Run QA Tests + +Now run the QA image and mount the QA model repositories into the +container so the tests will be able to access them. + +``` +$ docker run --gpus=all -it --rm -v/tmp:/data/inferenceserver tritonserver_qa +``` + +Within the container the QA tests are in /opt/tritonserver/qa. To run +a test, change directory to the test and run the test.sh script. + +``` +$ cd +$ bash -x ./test.sh +``` + +### Sanity Tests + +Many tests require that you use a complete Triton build, with all +backends and other features enabled. There are three sanity tests that +are parameterized so that you can run them even if you have built a +Triton that contains only a subset of all supported Triton +backends. These tests are L0_infer, L0_batcher and +L0_sequence_batcher. For these tests the following envvars are +available to control how the tests behave: + +* BACKENDS: Control which backends are tested. Look in the test.sh + file of the test to see the default and allowed values. + +* ENSEMBLES: Enable testing of ensembles. Set to "0" to disable, set + to "1" to enable. If enabled you must have the *identity* backend + included in your Triton build. + +* EXPECTED_NUM_TESTS: The tests perform a check of the total number of + test sub-cases. The exact number of sub-cases that run will depend + on the values you use for BACKENDS and ENSEMBLES. So you will need + to adjust this as appropriate for your testing. + +For example, if you build a Triton that has only the TensorRT backend +you can run L0_infer as follows: + +``` +$ BACKENDS="plan" ENSEMBLES=0 EXPECTED_NUM_TESTS= bash -x ./test.sh +``` + +Where '\' is the number of sub-tests expected to be run for +just TensorRT testing and no ensembles. Depending on which backend(s) +you are testing you will need to experiment and determine the correct +value for '\'. diff --git a/docs/customization_guide/tritonfrontend.md b/docs/customization_guide/tritonfrontend.md new file mode 100644 index 0000000000..3b47e4dbee --- /dev/null +++ b/docs/customization_guide/tritonfrontend.md @@ -0,0 +1,146 @@ + +### Triton Server (tritonfrontend) Bindings (Beta) + +The `tritonfrontend` python package is a set of bindings to Triton's existing +frontends implemented in C++. 
Currently, `tritonfrontend` supports starting up +`KServeHttp` and `KServeGrpc` frontends. These bindings used in-combination +with Triton's Python In-Process API +([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) +and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library) +extend the ability to use Triton's full feature set with a few lines of Python. + +Let us walk through a simple example: +1. First we need to load the desired models and start the server with `tritonserver`. +```python +import tritonserver + +# Constructing path to Model Repository +model_path = f"server/src/python/examples/example_model_repository" + +server_options = tritonserver.Options( + server_id="ExampleServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, +) +server = tritonserver.Server(server_options).start(wait_until_ready=True) +``` +Note: `model_path` may need to be edited depending on your setup. + + +2. Now, to start up the respective services with `tritonfrontend` +```python +from tritonfrontend import KServeHttp, KServeGrpc +http_options = KServeHttp.Options(thread_count=5) +http_service = KServeHttp.Server(server, http_options) +http_service.start() + +# Default options (if none provided) +grpc_service = KServeGrpc.Server(server) +grpc_service.start() +``` + +3. Finally, with running services, we can use `tritonclient` or simple `curl` commands to send requests and receive responses from the frontends. + +```python +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 +model_name = "identity" # output == input +url = "localhost:8000" + +# Create a Triton client +client = httpclient.InferenceServerClient(url=url) + +# Prepare input data +input_data = np.array([["Roger Roger"]], dtype=object) + +# Create input and output objects +inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + +# Set the data for the input tensor +inputs[0].set_data_from_numpy(input_data) + +results = client.infer(model_name, inputs=inputs) + +# Get the output data +output_data = results.as_numpy("OUTPUT0") + +# Print results +print("[INFERENCE RESULTS]") +print("Output data:", output_data) + +# Stop respective services and server. +http_service.stop() +grpc_service.stop() +server.stop() +``` + +--- + +Additionally, `tritonfrontend` provides context manager support as well. So steps 2-3, could also be achieved through: +```python +from tritonfrontend import KServeHttp +import tritonclient.http as httpclient +import numpy as np # Use version numpy < 2 + +with KServeHttp.Server(server) as http_service: + # The identity model returns an exact duplicate of the input data as output + model_name = "identity" + url = "localhost:8000" + # Create a Triton client + with httpclient.InferenceServerClient(url=url) as client: + # Prepare input data + input_data = np.array(["Roger Roger"], dtype=object) + # Create input and output objects + inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + # Perform inference + results = client.infer(model_name, inputs=inputs) + # Get the output data + output_data = results.as_numpy("OUTPUT0") + # Print results + print("[INFERENCE RESULTS]") + print("Output data:", output_data) + +server.stop() +``` +With this workflow, you can avoid having to stop each service after client requests have terminated. 
+ + +## Known Issues +- The following features are not currently supported when launching the Triton frontend services through the python bindings: + - [Tracing](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/trace.md) + - [Shared Memory](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md) + - [Metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) + - [Restricted Protocols](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#limit-endpoint-access-beta) + - VertexAI + - Sagemaker +- After a running server has been stopped, if the client sends an inference request, a Segmentation Fault will occur. \ No newline at end of file diff --git a/docs/examples/README.md b/docs/examples/README.md new file mode 100644 index 0000000000..84bfcb9499 --- /dev/null +++ b/docs/examples/README.md @@ -0,0 +1,35 @@ + + +# Triton Examples + +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) to begin your Triton journey! + +This folder contains the following: +* jetson: This covers deploying Triton Inference Server on Jetson devices. +* model_repository: This folder is a basic model repository for deploying models using the Triton Inference Server. \ No newline at end of file diff --git a/docs/examples/fetch_models.sh b/docs/examples/fetch_models.sh index 0612dfc6cb..f5aaed85aa 100755 --- a/docs/examples/fetch_models.sh +++ b/docs/examples/fetch_models.sh @@ -27,16 +27,14 @@ set -ex -# Caffe2 resnet50 -mkdir -p model_repository/resnet50_netdef/1 -wget -O model_repository/resnet50_netdef/1/model.netdef \ - http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/predict_net.pb -wget -O model_repository/resnet50_netdef/1/init_model.netdef \ - http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/init_net.pb - # TensorFlow inception mkdir -p model_repository/inception_graphdef/1 wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \ https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz (cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/model.graphdef + +# ONNX densenet +mkdir -p model_repository/densenet_onnx/1 +wget -O model_repository/densenet_onnx/1/model.onnx \ + https://github.com/onnx/models/raw/main/validated/vision/classification/densenet-121/model/densenet-7.onnx diff --git a/docs/examples/jetson/README.md b/docs/examples/jetson/README.md new file mode 100644 index 0000000000..77a20474b9 --- /dev/null +++ b/docs/examples/jetson/README.md @@ -0,0 +1,68 @@ + + +# Using Triton Inference Server as a shared library for execution on Jetson + +## Overview +This project demonstrates how to run C API applications using Triton Inference Server as a shared library. We also show how to build and execute such applications on Jetson. + +### Prerequisites + +* JetPack >= 4.6 +* OpenCV >= 4.1.1 +* TensorRT >= 8.0.1.6 + +### Installation + +Follow the installation instructions from the GitHub release page ([https://github.com/triton-inference-server/server/releases/](https://github.com/triton-inference-server/server/releases/)). + +In our example, we placed the contents of downloaded release directory under `/opt/tritonserver`. + +## Part 1. 
Concurrent inference and dynamic batching + +The purpose of the sample located under [concurrency_and_dynamic_batching](concurrency_and_dynamic_batching/README.md) +is to demonstrate the important features of Triton Inference Server such as concurrent model execution and +dynamic batching. In order to do that, we implemented a people detection application using C API and Triton +Inference Server as a shared library. + +## Part 2. Analyzing model performance with perf_analyzer + +To analyze model performance on Jetson, +[perf_analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) +tool is used. The `perf_analyzer` is included in the release tar file or can be +compiled from source. + +From this directory of the repository, execute the following to evaluate model performance: + +```shell +./perf_analyzer -m peoplenet -b 2 --service-kind=triton_c_api --model-repo=$(pwd)/concurrency_and_dynamic_batching/trtis_model_repo_sample_1 --triton-server-directory=/opt/tritonserver --concurrency-range 1:6 -f perf_c_api.csv +``` + +In the example above we saved the results as a `.csv` file. To visualize these +results, follow the steps described +[here](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile new file mode 100644 index 0000000000..6506314999 --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile @@ -0,0 +1,47 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +TARGET=people_detection +GCC=g++ +GCC_PARMS+=-I../../server -I/usr/include/opencv4 -I../../core/include/ -I/usr/local/cuda/targets/aarch64-linux/include +GCC_PARMS+=-I${HOME}/tritonserver/include/tritonserver -D TRITON_ENABLE_GPU=ON -D TRITON_MIN_COMPUTE_CAPABILITY=5.3 + +GCC_LIBS=-L${HOME}/tritonserver/lib -L/usr/lib -L/usr/local/cuda/targets/aarch64-linux/lib +GCC_LIBS+=-lpthread -ltritonserver -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -lopencv_dnn -lcudart + +all: $(TARGET) + + +%.o: %.cc + $(GCC) $(GCC_PARMS) -c -g -o $@ $^ + +$(TARGET): $(TARGET).o + $(GCC) $^ $(GCC_LIBS) -o $@ + +clean: + rm -f $(TARGET).o $(TARGET) + +.PHONY: all clean diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/README.md b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md new file mode 100644 index 0000000000..1f96dd365d --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md @@ -0,0 +1,331 @@ + + +# Concurrent inference and dynamic batching + +The purpose of this sample is to demonstrate the important features of Triton Inference Server such as concurrent model execution and dynamic batching. + +We will be using a purpose built deployable people detection model, which we download from [Nvidia GPU Cloud (NGC)](https://ngc.nvidia.com/). + +## Acquiring the model + +Download the pruned [PeopleNet](https://ngc.nvidia.com/catalog/models/nvidia:tlt_peoplenet) model from the NGC. This model is available as a ready-to-use model, and you can download it from NGC using either `wget` method: + +```shell +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/tao/peoplenet/versions/pruned_v2.1/zip -O pruned_v2.1.zip +``` + +or via CLI command: + +```shell +ngc registry model download-version "nvidia/tao/peoplenet:pruned_v2.1" +``` + +For latter you need to setup the [NGC CLI](https://ngc.nvidia.com/setup). + +Having downloaded the model from the NGC, unzip the archive `peoplenet_pruned_v2.1.zip` into `concurrency_and_dynamic_batching/tao/models/peoplenet`. + +If you have the zip archive in the `concurrency_and_dynamic_batching` directory, the following will automatically place the model to the correct location: + +```shell +unzip pruned_v2.1.zip -d $(pwd)/tao/models/peoplenet +``` + +Verify that you can see the model file `resnet34_peoplenet_pruned.etlt` under + +``` +concurrency_and_dynamic_batching +└── tao +   └── models +   └── peoplenet +   ├── labels.txt +   └── resnet34_peoplenet_pruned.etlt +``` + +## Converting the model to TensorRT + +After you have acquired the model file in `.etlt` format, you will need to convert the model to [TensorRT](https://developer.nvidia.com/tensorrt) format. NVIDIA TensorRT is an SDK for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high throughput for deep learning inference applications. The latest versions of JetPack include TensorRT. + +In order to convert an `.etlt` model to TensorRT format, you need to use the `tao-converter` tool. + +The `tao-converter` tool is available as a compiled release file for different platforms. The download links corresponding to your deployment system are provided among the [TLT Getting Started resources](https://developer.nvidia.com/tlt-get-started). + +After you have downloaded `tao-converter`, you might need to execute + +```shell +chmod 777 tao-converter +``` + +in the directory with the tool. 
+ +We provide a conversion script `tao/convert_peoplenet.sh` which expects the model to be present at the location. + +```shell +tao +└── models + └── peoplenet +``` + +To execute it, you can place the `tao-converter` executable to the `tao` directory of the project and in the same directory run + +```shell +bash convert_peoplenet.sh +``` + +After you execute it, verify that a `model.plan` file was placed to to the directories `/trtis_model_repo_sample_1/peoplenet/1` and `/trtis_model_repo_sample_2/peoplenet/1`. Note that we have two slightly different repositories for the same model to demonstrate different features of Triton. + +Also note that this step has to be performed on the target hardware: if you are planning to execute this application on Jetson, the conversion has to be performed on Jetson. + +To learn more about `tao-converter`parameters, run: + +```shell +./tao-converter -h +``` + +## Building the app + +To compile the sample, pull the following repositories: +* [https://github.com/triton-inference-server/server](https://github.com/triton-inference-server/server) +* [https://github.com/triton-inference-server/core](https://github.com/triton-inference-server/core) + +Make sure you copied the contents of the release you downloaded to `$HOME` + +```shell +sudo cp -rf tritonserver2.x.y-jetpack4.6 $HOME/tritonserver +``` + +Open the terminal in `concurrency_and_dynamic_batching` and build the app executing + +```shell +make +``` + +An example Makefile is provided for Jetson. + +## Demonstration case 1: Concurrent model execution + +With Triton Inference Server, multiple models (or multiple instances of the same model) can run simultaneously on the same GPU or on multiple GPUs. In this example, we are demonstrating how to run multiple instances of the same model on a single Jetson GPU. + +### Running the sample + +To execute from the terminal, run from the `concurrency_and_dynamic_batching` directory: + +```shell +LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_1 -t 6 -s false -p $HOME/tritonserver +``` + +The parameter `-t` controls the number of concurrent inference calls we want to execute. We will be executing the same model on the same sample image with the purpose of demonstrating how setting different concurrency options affects the performance. + +You can enable saving detected bounding boxes in the project directory in form of overlays over the original image for each execution thread. You can turn the visualization on by setting the parameter `-s` to `true` upon execution (`-s` is set to `false` by default). + +### Expected output + +Upon execution, in the terminal log you will see _Model 'peoplenet' Stats_ in json format reflecting the inference performance. We also output _TOTAL INFERENCE TIME_ which simply reflects the elapsed time required to run the application including data loading, pre-processing and post-processing. 
+ +A typical output in the log for _Model 'peoplenet' Stats_ looks as follows: + +```json +{ + "model_stats":[ + { + "name":"peoplenet", + "version":"1", + "last_inference":1626448309997, + "inference_count":6, + "execution_count":6, + "inference_stats":{ + "success":{ + "count":6, + "ns":574589968 + }, + "fail":{ + "count":0, + "ns":0 + }, + "queue":{ + "count":6, + "ns":234669630 + }, + "compute_input":{ + "count":6, + "ns":194884512 + }, + "compute_infer":{ + "count":6, + "ns":97322636 + }, + "compute_output":{ + "count":6, + "ns":47700806 + } + }, + "batch_stats":[ + { + "batch_size":1, + "compute_input":{ + "count":6, + "ns":194884512 + }, + "compute_infer":{ + "count":6, + "ns":97322636 + }, + "compute_output":{ + "count":6, + "ns":47700806 + } + } + ] + } + ] +} + +"TOTAL INFERENCE TIME: 174ms" +``` + +To learn about different statistics check out the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md#statistics-extension). + +To see how setting different values for concurrency affects total execution time and its components reflected in the model stats, you need to modify a single parameter in the model config file. + +To enable concurrent model execution support for a model, corresponding model config file `trtis_model_repo_sample_1/peoplenet/config.pbtxt` includes the following: + +``` +instance_group [ + { + count: 3 + kind: KIND_GPU + } +] +``` + +You can change the count of allowed inferences for the same model instance and observe how it affects performance in _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_. Note that on Jetson we dont recommend setting values too high: for instance, on a device like a Jetson Xavier AGX we don't recommend setting the number larger than 6. The values in the range 1-3 are optimal. + +While trying out different values, note how it affects total inference time as well as some inference statistics (like queue and compute times) + +## Demonstration case 2: Dynamic batching + +For models that support batching, Triton implements multiple scheduling and batching algorithms that combine individual inference requests together to improve inference throughput. In this example, we want to demonstrate how enbling automatic dynamic batching affects inference performance. + +### Running the sample + +To observe the effect of dynamic batching, from the `concurrency_and_dynamic_batching` directory execute: + +```shell +LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_2 -t 6 -s false -p $HOME/tritonserver +``` + +### Expected output + +Take a look at _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_ to see the effect of dynamic batching. 
A possible outcome looks like this:
+
+```json
+{
+   "model_stats":[
+      {
+         "name":"peoplenet",
+         "version":"1",
+         "last_inference":1626447787832,
+         "inference_count":6,
+         "execution_count":2,
+         "inference_stats":{
+            "success":{
+               "count":6,
+               "ns":558981051
+            },
+            "fail":{
+               "count":0,
+               "ns":0
+            },
+            "queue":{
+               "count":6,
+               "ns":49271380
+            },
+            "compute_input":{
+               "count":6,
+               "ns":170634044
+            },
+            "compute_infer":{
+               "count":6,
+               "ns":338079193
+            },
+            "compute_output":{
+               "count":6,
+               "ns":950544
+            }
+         },
+         "batch_stats":[
+            {
+               "batch_size":1,
+               "compute_input":{
+                  "count":1,
+                  "ns":15955684
+               },
+               "compute_infer":{
+                  "count":1,
+                  "ns":29917093
+               },
+               "compute_output":{
+                  "count":1,
+                  "ns":152264
+               }
+            },
+            {
+               "batch_size":5,
+               "compute_input":{
+                  "count":1,
+                  "ns":30935672
+               },
+               "compute_infer":{
+                  "count":1,
+                  "ns":61632420
+               },
+               "compute_output":{
+                  "count":1,
+                  "ns":159656
+               }
+            }
+         ]
+      }
+   ]
+}
+
+"TOTAL INFERENCE TIME: 162ms"
+```
+
+Notice that this time the model was executed only twice (as indicated by `execution_count`). Also, unlike in the previous example, the `batch_stats` part of the statistics looks different: we see that our model was executed once with `batch = 1` and once with `batch = 5`. This helped decrease the total inference time.
+
+To enable dynamic batching, the following is present in the model config `trtis_model_repo_sample_2/peoplenet/config.pbtxt`:
+
+```
+dynamic_batching {
+}
+```
+
+To try out further options of the dynamic batcher, see the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher).
+
+You can also try enabling both concurrent model execution and dynamic batching.
\ No newline at end of file
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/capture.jpg b/docs/examples/jetson/concurrency_and_dynamic_batching/capture.jpg
new file mode 100644
index 0000000000..82e2cb38e0
Binary files /dev/null and b/docs/examples/jetson/concurrency_and_dynamic_batching/capture.jpg differ
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/common.h b/docs/examples/jetson/concurrency_and_dynamic_batching/common.h
new file mode 100644
index 0000000000..b55c8b71c5
--- /dev/null
+++ b/docs/examples/jetson/concurrency_and_dynamic_batching/common.h
@@ -0,0 +1,106 @@
+// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//   * Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//   * Redistributions in binary form must reproduce the above copyright
+//     notice, this list of conditions and the following disclaimer in the
+//     documentation and/or other materials provided with the distribution.
+//   * Neither the name of NVIDIA CORPORATION nor the names of its
+//     contributors may be used to endorse or promote products derived
+//     from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include + +#include "triton/core/tritonserver.h" + +#define RETURN_IF_ERR(X) \ + do { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + return err__; \ + } \ + } while (false) + +#define RETURN_MSG_IF_ERR(X, MSG) \ + do { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + return TRITONSERVER_ErrorNew( \ + TRITONSERVER_ErrorCode(err__), \ + (std::string(MSG) + ": " + TRITONSERVER_ErrorMessage(err__)) \ + .c_str()); \ + } \ + } while (false) + +#define GOTO_IF_ERR(X, T) \ + do { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + goto T; \ + } \ + } while (false) + +#define FAIL(MSG) \ + do { \ + std::cerr << "error: " << (MSG) << std::endl; \ + exit(1); \ + } while (false) + +#define FAIL_IF_ERR(X, MSG) \ + do { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + std::cerr << "error: " << (MSG) << ": " \ + << TRITONSERVER_ErrorCodeString(err__) << " - " \ + << TRITONSERVER_ErrorMessage(err__) << std::endl; \ + TRITONSERVER_ErrorDelete(err__); \ + exit(1); \ + } \ + } while (false) + +#define IGNORE_ERR(X) \ + do { \ + TRITONSERVER_Error* err__ = (X); \ + if (err__ != nullptr) { \ + TRITONSERVER_ErrorDelete(err__); \ + } \ + } while (false) + +#ifdef TRITON_ENABLE_GPU +#define FAIL_IF_CUDA_ERR(X, MSG) \ + do { \ + cudaError_t err__ = (X); \ + if (err__ != cudaSuccess) { \ + std::cerr << "error: " << (MSG) << ": " << cudaGetErrorString(err__) \ + << std::endl; \ + exit(1); \ + } \ + } while (false) +#endif // TRITON_ENABLE_GPU + +/// Get the integral version from a string, or fail if string does not +/// represent a valid version. +/// +/// \param version_string The string version. +/// \param version Returns the integral version. +/// \return The error status. Failure if 'version_string' doesn't +/// convert to valid version. +TRITONSERVER_Error* GetModelVersionFromString( + const std::string& version_string, int64_t* version); diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/labels.txt b/docs/examples/jetson/concurrency_and_dynamic_batching/labels.txt new file mode 100644 index 0000000000..8ae80671d6 --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/labels.txt @@ -0,0 +1,4 @@ +person +bag +face + diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/people_detection.cc b/docs/examples/jetson/concurrency_and_dynamic_batching/people_detection.cc new file mode 100644 index 0000000000..ce22bdcba9 --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/people_detection.cc @@ -0,0 +1,1158 @@ +// Copyright (c) 2021, NVIDIA CORPORATION& AFFILIATES.All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "opencv2/core.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/opencv.hpp" +#include "triton/core/tritonserver.h" + +#ifdef TRITON_ENABLE_GPU +#include +#endif // TRITON_ENABLE_GPU + +namespace { + +bool enforce_memory_type = false; +TRITONSERVER_MemoryType requested_memory_type; + +#ifdef TRITON_ENABLE_GPU +static auto cuda_data_deleter = [](void* data) { + if (data != nullptr) { + cudaPointerAttributes attr; + auto cuerr = cudaPointerGetAttributes(&attr, data); + if (cuerr != cudaSuccess) { + std::cerr << "error: failed to get CUDA pointer attribute of " << data + << ": " << cudaGetErrorString(cuerr) << std::endl; + } + if (attr.type == cudaMemoryTypeDevice) { + cuerr = cudaFree(data); + } else if (attr.type == cudaMemoryTypeHost) { + cuerr = cudaFreeHost(data); + } + if (cuerr != cudaSuccess) { + std::cerr << "error: failed to release CUDA pointer " << data << ": " + << cudaGetErrorString(cuerr) << std::endl; + } + } +}; +#endif // TRITON_ENABLE_GPU + +void +Usage(char** argv, const std::string& msg = std::string()) +{ + if (!msg.empty()) { + std::cerr << msg << std::endl; + } + + std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; + std::cerr << "\t-m <\"system\"|\"pinned\"|gpu>" + << " Enforce the memory type for input and output tensors." + << " If not specified, inputs will be in system memory and outputs" + << " will be based on the model's preferred type." << std::endl; + std::cerr << "\t-v Enable verbose logging." << std::endl; + std::cerr + << "\t-t Thread count to simulate the number of concurrent requests." + << std::endl; + std::cerr << "\t-r [model repository absolute path]." << std::endl; + std::cerr << "\t-p [tritonserver path]." << std::endl; + std::cerr << "\t-s ." + << " Specify whether output visualizations will be saved to the " + "project folder." + << " If not specified, no outputs will be saved." 
<< std::endl; + + exit(1); +} + +TRITONSERVER_Error* +ResponseAlloc( + TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name, + size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type, + int64_t preferred_memory_type_id, void* userp, void** buffer, + void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type, + int64_t* actual_memory_type_id) +{ + // Initially attempt to make the actual memory type and id that we + // allocate be the same as preferred memory type + *actual_memory_type = preferred_memory_type; + *actual_memory_type_id = preferred_memory_type_id; + + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't + // need to do any other book-keeping. + if (byte_size == 0) { + *buffer = nullptr; + *buffer_userp = nullptr; + std::cout << "allocated " << byte_size << " bytes for result tensor " + << tensor_name << std::endl; + } else { + void* allocated_ptr = nullptr; + if (enforce_memory_type) { + *actual_memory_type = requested_memory_type; + } + + switch (*actual_memory_type) { +#ifdef TRITON_ENABLE_GPU + case TRITONSERVER_MEMORY_CPU_PINNED: { + auto err = cudaSetDevice(*actual_memory_type_id); + if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && + (err != cudaErrorInsufficientDriver)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "unable to recover current CUDA device: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + + err = cudaHostAlloc(&allocated_ptr, byte_size, cudaHostAllocPortable); + if (err != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "cudaHostAlloc failed: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + break; + } + + case TRITONSERVER_MEMORY_GPU: { + auto err = cudaSetDevice(*actual_memory_type_id); + if ((err != cudaSuccess) && (err != cudaErrorNoDevice) && + (err != cudaErrorInsufficientDriver)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "unable to recover current CUDA device: " + + std::string(cudaGetErrorString(err))) + .c_str()); + } + + err = cudaMalloc(&allocated_ptr, byte_size); + if (err != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + std::string( + "cudaMalloc failed: " + std::string(cudaGetErrorString(err))) + .c_str()); + } + break; + } +#endif // TRITON_ENABLE_GPU + + // Use CPU memory if the requested memory type is unknown + // (default case). + case TRITONSERVER_MEMORY_CPU: + default: { + *actual_memory_type = TRITONSERVER_MEMORY_CPU; + allocated_ptr = malloc(byte_size); + break; + } + } + + // Pass the tensor name with buffer_userp so we can show it when + // releasing the buffer. 
+ if (allocated_ptr != nullptr) { + *buffer = allocated_ptr; + *buffer_userp = new std::string(tensor_name); + std::cout << "allocated " << byte_size << " bytes in " + << TRITONSERVER_MemoryTypeString(*actual_memory_type) + << " for result tensor " << tensor_name << std::endl; + } + } + + return nullptr; // Success +} + +TRITONSERVER_Error* +ResponseRelease( + TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp, + size_t byte_size, TRITONSERVER_MemoryType memory_type, + int64_t memory_type_id) +{ + std::string* name = nullptr; + if (buffer_userp != nullptr) { + name = reinterpret_cast(buffer_userp); + } else { + name = new std::string(""); + } + + std::cout << "Releasing buffer " << buffer << " of size " << byte_size + << " in " << TRITONSERVER_MemoryTypeString(memory_type) + << " for result '" << *name << "'" << std::endl; + switch (memory_type) { + case TRITONSERVER_MEMORY_CPU: + free(buffer); + break; +#ifdef TRITON_ENABLE_GPU + case TRITONSERVER_MEMORY_CPU_PINNED: { + auto err = cudaSetDevice(memory_type_id); + if (err == cudaSuccess) { + err = cudaFreeHost(buffer); + } + if (err != cudaSuccess) { + std::cerr << "error: failed to cudaFree " << buffer << ": " + << cudaGetErrorString(err) << std::endl; + } + break; + } + case TRITONSERVER_MEMORY_GPU: { + auto err = cudaSetDevice(memory_type_id); + if (err == cudaSuccess) { + err = cudaFree(buffer); + } + if (err != cudaSuccess) { + std::cerr << "error: failed to cudaFree " << buffer << ": " + << cudaGetErrorString(err) << std::endl; + } + break; + } +#endif // TRITON_ENABLE_GPU + default: + std::cerr << "error: unexpected buffer allocated in CUDA managed memory" + << std::endl; + break; + } + + delete name; + + return nullptr; // Success +} + +void +InferRequestComplete( + TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp) +{ + // We reuse the request so we don't delete it here. +} + +void +InferResponseComplete( + TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp) +{ + if (response != nullptr) { + // Send 'response' to the future. 
+ std::promise* p = + reinterpret_cast*>(userp); + p->set_value(response); + delete p; + } +} + + +TRITONSERVER_Error* +ParseModelMetadata(const rapidjson::Document& model_metadata) +{ + std::string seen_data_type; + for (const auto& input : model_metadata["inputs"].GetArray()) { + if (strcmp(input["datatype"].GetString(), "FP32")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "this example only supports model with data type FP32"); + } + if (seen_data_type.empty()) { + seen_data_type = input["datatype"].GetString(); + } else if (strcmp(seen_data_type.c_str(), input["datatype"].GetString())) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "the inputs and outputs of this model must have the data type"); + } + } + for (const auto& output : model_metadata["outputs"].GetArray()) { + if (strcmp(output["datatype"].GetString(), "FP32")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "this example only supports model with data type FP32"); + } else if (strcmp(seen_data_type.c_str(), output["datatype"].GetString())) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "the inputs and outputs of this model must have the data type"); + } + } + + return nullptr; +} + + +cv::Mat +ResizeKeepAspectRatio( + const cv::Mat& input, const cv::Size& dstSize, const cv::Scalar& bgcolor, + bool& fixHeight, float& ratio, int& sideCache) +{ + cv::Mat output; + + double h1 = dstSize.width * (input.rows / (double)input.cols); + double w2 = dstSize.height * (input.cols / (double)input.rows); + if (h1 <= dstSize.height) { + cv::resize(input, output, cv::Size(dstSize.width, h1)); + ratio = (float)dstSize.width / input.cols; + fixHeight = false; + sideCache = (int)(ratio * input.rows); + std::cout << "Resizing to fixed width. Ratio " << ratio << std::endl; + std::cout << "Height cache " << sideCache << std::endl; + } else { + cv::resize(input, output, cv::Size(w2, dstSize.height)); + ratio = (float)dstSize.height / input.rows; + fixHeight = true; + sideCache = (int)(ratio * input.cols); + std::cout << "Resizing to fixed height. 
Ratio " << ratio << std::endl; + std::cout << "Width cache " << sideCache << std::endl; + } + + int top = (dstSize.height - output.rows) / 2; + int down = (dstSize.height - output.rows + 1) / 2; + int left = (dstSize.width - output.cols) / 2; + int right = (dstSize.width - output.cols + 1) / 2; + + cv::copyMakeBorder( + output, output, top, down, left, right, cv::BORDER_CONSTANT, bgcolor); + + return output; +} + + +void +SaveOverlay( + std::vector& bboxes_list, std::vector& indexes, + std::vector& input0_shape, bool& fixHeight, float& ratio, + int& sideCache, std::string imageName, size_t& thread_id) +{ + const int inputC = input0_shape[1]; + const int inputH = input0_shape[2]; + const int inputW = input0_shape[3]; + + cv::Mat image = cv::imread(imageName); + + cv::Scalar color = cv::Scalar(0, 255, 0); + + int xmin, ymin, xmax, ymax; + + for (auto i : indexes) { + xmin = bboxes_list[i].x; + ymin = bboxes_list[i].y; + xmax = bboxes_list[i].x + bboxes_list[i].width; + ymax = bboxes_list[i].y + bboxes_list[i].height; + + if (fixHeight) { + xmin = int((xmin - (inputW - sideCache) / 2) / ratio); + xmax = int((xmax - (inputW - sideCache) / 2) / ratio); + ymin = int(ymin / ratio); + ymax = int(ymax / ratio); + } else { + ymin = int((ymin - (inputH - sideCache) / 2) / ratio); + ymax = int((ymax - (inputH - sideCache) / 2) / ratio); + xmin = int(xmin / ratio); + xmax = int(xmax / ratio); + } + cv::Point p1(xmin, ymin); + cv::Point p2(xmax, ymax); + cv::rectangle(image, p1, p2, color, 4); + } + + std::string outName = "capture_overlay_" + std::to_string(thread_id) + ".jpg"; + imwrite(outName, image); +} + + +void +Normalize(cv::Mat img, std::vector*& data, int inputC) +{ + for (int c = 0; c < inputC; ++c) { + for (int i = 0; i < img.rows; ++i) { + cv::Vec3b* p1 = img.ptr(i); + for (int j = 0; j < img.cols; ++j) { + ((float*)data->data())[c * img.cols * img.rows + i * img.cols + j] = + p1[j][c] / 255.f; + } + } + } +} + + +void +RecoverBoundingBoxes( + std::unordered_map>& output_data, + std::unordered_map& shapes, + std::vector& input0_shape, std::vector& bboxes_list, + std::vector& scores_list, std::vector& indexes) +{ + const float box_scale = 35.f; + const float box_offset = 0.5f; + const float score_threshold = 0.5f; + const float nms_threshold = 0.5f; + + int gridH = shapes["output_cov/Sigmoid"][2]; + int gridW = shapes["output_cov/Sigmoid"][3]; + + std::cout << "gridH: " << gridH << std::endl; + std::cout << "gridW: " << gridW << std::endl; + + int modelH = input0_shape[2]; + int modelW = input0_shape[3]; + int batch = input0_shape[0]; + + std::cout << "batch: " << batch << std::endl; + std::cout << "modelH: " << modelH << std::endl; + std::cout << "modelW: " << modelW << std::endl; + + int cellH = modelH / gridH; + int cellW = modelW / gridW; + + for (int b = 0; b < batch; b++) { + for (int h = 0; h < gridH; h++) { + for (int w = 0; w < gridW; w++) { + // value(n, c, h, w) = n * CHW + c * HW + h * W + w + int idx = b * gridH * gridW + h * gridW + w; + float val = output_data["output_cov/Sigmoid"][idx]; + if (val > score_threshold) { + scores_list.push_back(val); + + // location of the w, h coordinate in the original image + int mx = w * cellW; + int my = h * cellH; + + // scale the detected coordinates to original and return their + // location in the image + int idxX1 = b * 3 * gridH * gridW + 0 * gridH * gridW + h * gridW + w; + int idxY1 = b * 3 * gridH * gridW + 1 * gridH * gridW + h * gridW + w; + int idxX2 = b * 3 * gridH * gridW + 2 * gridH * gridW + h * gridW + w; + int idxY2 = b * 3 
* gridH * gridW + 3 * gridH * gridW + h * gridW + w; + + int rectX1 = + -(output_data["output_bbox/BiasAdd"][idxX1] + box_offset) * + box_scale + + mx; + int rectY1 = + -(output_data["output_bbox/BiasAdd"][idxY1] + box_offset) * + box_scale + + my; + int rectX2 = + (output_data["output_bbox/BiasAdd"][idxX2] + box_offset) * + box_scale + + mx; + int rectY2 = + (output_data["output_bbox/BiasAdd"][idxY2] + box_offset) * + box_scale + + my; + + // Rect ROI (x, y, width, height); + cv::Rect bbox(rectX1, rectY1, rectX2 - rectX1, rectY2 - rectY1); + bboxes_list.push_back(bbox); + } + } + } + } + + // Execute non-maximum suppression + cv::dnn::NMSBoxes( + bboxes_list, scores_list, score_threshold, nms_threshold, indexes); +} + +void +ParseDetections( + TRITONSERVER_InferenceResponse* response, const std::string& output0, + const std::string& output1, + std::unordered_map>& output_data, + std::unordered_map& shapes) +{ + uint32_t output_count; + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseOutputCount(response, &output_count), + "getting number of response outputs"); + if (output_count != 2) { + FAIL("expecting 2 response outputs, got " + std::to_string(output_count)); + } + + for (uint32_t idx = 0; idx < output_count; ++idx) { + const char* cname; + TRITONSERVER_DataType datatype; + const int64_t* shape; + uint64_t dim_count; + const void* base; + size_t byte_size; + TRITONSERVER_MemoryType memory_type; + int64_t memory_type_id; + void* userp; + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseOutput( + response, idx, &cname, &datatype, &shape, &dim_count, &base, + &byte_size, &memory_type, &memory_type_id, &userp), + "getting output info"); + + if (cname == nullptr) { + FAIL("unable to get output name"); + } + + std::string name(cname); + if ((name != output0) && (name != output1)) { + FAIL("unexpected output '" + name + "'"); + } + + shapes[name] = shape; + + std::vector& odata = output_data[name]; + + switch (memory_type) { + case TRITONSERVER_MEMORY_CPU: { + std::cout << std::endl + << name << " is stored in system memory" << std::endl; + const float* cbase = reinterpret_cast(base); + odata.assign(cbase, cbase + byte_size / sizeof(float)); + break; + } + + case TRITONSERVER_MEMORY_CPU_PINNED: { + std::cout << std::endl + << name << " is stored in pinned memory" << std::endl; + const float* cbase = reinterpret_cast(base); + odata.assign(cbase, cbase + byte_size / sizeof(float)); + break; + } + +#ifdef TRITON_ENABLE_GPU + case TRITONSERVER_MEMORY_GPU: { + std::cout << std::endl + << name << " is stored in GPU memory" << std::endl; + odata.reserve(byte_size); + FAIL_IF_CUDA_ERR( + cudaMemcpy(&odata[0], base, byte_size, cudaMemcpyDeviceToHost), + "getting " + name + " data from GPU memory"); + break; + } +#endif + + default: + FAIL("unexpected memory type"); + } + } +} + +void +DetectionInferenceOutput( + std::vector& result_indexes, std::vector& bboxes_list, + TRITONSERVER_InferenceResponse* completed_response, + const std::string& output0, const std::string& output1, + std::vector& input0_shape, bool& fixHeight, float& ratio, + int& sideCache, size_t& thread_id, bool visualize = false, + std::string imageName = "capture.jpg") +{ + // Parse outputs + std::unordered_map> output_data; + std::unordered_map shapes; + ParseDetections(completed_response, output0, output1, output_data, shapes); + + std::vector scores_list; + RecoverBoundingBoxes( + output_data, shapes, input0_shape, bboxes_list, scores_list, + result_indexes); + + std::cout << "Detection finished. 
Indexes of detected objects: " << std::endl; + for (auto idx : result_indexes) { + std::cout << idx << std::endl; + std::cout << bboxes_list[idx] << std::endl; + } + + if (visualize) + SaveOverlay( + bboxes_list, result_indexes, input0_shape, fixHeight, ratio, sideCache, + imageName, thread_id); +} + + +} // namespace + + +void +SetServerOptions( + TRITONSERVER_ServerOptions** server_options, bool verbose_level, + std::string model_repository_path, std::string tritonserver_path) +{ + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsNew(server_options), "creating server options"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetModelRepositoryPath( + *server_options, model_repository_path.c_str()), + "setting model repository path"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetLogVerbose(*server_options, verbose_level), + "setting verbose logging level"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetMetrics(*server_options, true), + "failed to enable metrics"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetStrictReadiness(*server_options, true), + "failed to set strict readiness"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetStrictModelConfig(*server_options, true), + "failed to set strict model config"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetModelControlMode( + *server_options, TRITONSERVER_MODEL_CONTROL_EXPLICIT), + "failed to set model control mode to explicit"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetBackendDirectory( + *server_options, (tritonserver_path + "/backends").c_str()), + "setting backend directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetRepoAgentDirectory( + *server_options, (tritonserver_path + "/repoagents").c_str()), + "setting repository agent directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetStrictModelConfig(*server_options, true), + "setting strict model configuration"); +#ifdef TRITON_ENABLE_GPU + double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY; +#else + double min_compute_capability = 0; +#endif // TRITON_ENABLE_GPU + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability( + *server_options, min_compute_capability), + "setting minimum supported CUDA compute capability"); +} + + +void +CheckServerLiveAndReady(std::shared_ptr server) +{ + size_t wait_seconds = 0; + while (true) { + bool live, ready; + FAIL_IF_ERR( + TRITONSERVER_ServerIsLive(server.get(), &live), + "unable to get server liveness"); + FAIL_IF_ERR( + TRITONSERVER_ServerIsReady(server.get(), &ready), + "unable to get server readiness"); + std::cout << "Server Health: live " << live << ", ready " << ready + << std::endl; + if (live && ready) { + break; + } + + if (++wait_seconds >= 10) { + FAIL("failed to find healthy inference server"); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } +} + + +void +PrintServerStatus(std::shared_ptr server) +{ + TRITONSERVER_Message* server_metadata_message; + FAIL_IF_ERR( + TRITONSERVER_ServerMetadata(server.get(), &server_metadata_message), + "unable to get server metadata message"); + const char* buffer; + size_t byte_size; + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + server_metadata_message, &buffer, &byte_size), + "unable to serialize server metadata message"); + + std::cout << "Server Status:" << std::endl; + std::cout << std::string(buffer, byte_size) << std::endl; + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(server_metadata_message), + "deleting status metadata"); +} + + +void +AwaitModelReady( + std::shared_ptr server, const std::string model_name) +{ + bool is_ready 
= false; + size_t wait_seconds = 0; + while (!is_ready) { + FAIL_IF_ERR( + TRITONSERVER_ServerModelIsReady( + server.get(), model_name.c_str(), 1, &is_ready), + "unable to get model readiness"); + if (!is_ready) { + if (++wait_seconds >= 5) { + FAIL("model failed to be ready in 5 seconds"); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + continue; + } + + TRITONSERVER_Message* model_metadata_message; + FAIL_IF_ERR( + TRITONSERVER_ServerModelMetadata( + server.get(), model_name.c_str(), 1, &model_metadata_message), + "unable to get model metadata message"); + const char* buffer; + size_t byte_size; + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + model_metadata_message, &buffer, &byte_size), + "unable to serialize model status protobuf"); + + rapidjson::Document model_metadata; + model_metadata.Parse(buffer, byte_size); + if (model_metadata.HasParseError()) { + FAIL( + "error: failed to parse model metadata from JSON: " + + std::string(GetParseError_En(model_metadata.GetParseError())) + + " at " + std::to_string(model_metadata.GetErrorOffset())); + } + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(model_metadata_message), + "deleting status protobuf"); + + if (strcmp(model_metadata["name"].GetString(), model_name.c_str())) { + FAIL("unable to find metadata for model"); + } + + bool found_version = false; + if (model_metadata.HasMember("versions")) { + for (const auto& version : model_metadata["versions"].GetArray()) { + if (strcmp(version.GetString(), "1") == 0) { + found_version = true; + break; + } + } + } + if (!found_version) { + FAIL("unable to find version 1 status for model"); + } + + FAIL_IF_ERR(ParseModelMetadata(model_metadata), "parsing model metadata"); + } +} + + +void +LoadInputImageFromFile( + cv::Mat& dst, std::vector& input0_shape, bool& fixHeight, + float& ratio, int& sideCache, std::string imageName = "capture.jpg") +{ + const int inputC = input0_shape[1]; + const int inputH = input0_shape[2]; + const int inputW = input0_shape[3]; + const int batchSize = input0_shape[0]; + + cv::Mat image = cv::imread(imageName); + + if (image.empty()) { + std::cout << "Cannot open image " << imageName << std::endl; + exit(0); + } + + // resize keeping aspect ratio and pad + dst = ResizeKeepAspectRatio( + image, cv::Size(inputW, inputH), cv::Scalar(0, 0, 0), fixHeight, ratio, + sideCache); + + cv::cvtColor(dst, dst, cv::COLOR_BGR2RGB); +} + + +void +LoadInputData( + cv::Mat& dst, std::vector* input0_data, + std::vector& input0_shape) +{ + const int inputC = input0_shape[1]; + const int inputH = input0_shape[2]; + const int inputW = input0_shape[3]; + + input0_data->resize(inputC * inputH * inputW * sizeof(float)); + + // normalize + Normalize(dst, input0_data, inputC); +} + +static std::mutex mutex; + +void +RunInferenceAndValidate( + std::shared_ptr server, + TRITONSERVER_ResponseAllocator* allocator, cv::Mat scaled_input_image, + bool fixHeight, float ratio, int sideCache, std::string model_name, + size_t thread_id, bool visualize) +{ + TRITONSERVER_InferenceRequest* irequest = nullptr; + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestNew( + &irequest, server.get(), model_name.c_str(), -1), + "creating inference request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"), + "setting ID for the request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetReleaseCallback( + irequest, InferRequestComplete, nullptr), + "setting request release callback"); + + // Inputs + auto input0 = "input_1"; + std::vector input0_shape({1, 3, 544, 
960}); + + const TRITONSERVER_DataType datatype = TRITONSERVER_TYPE_FP32; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddInput( + irequest, input0, datatype, &input0_shape[0], input0_shape.size()), + "setting input 0 meta-data for the request"); + + // Outputs + auto output0 = "output_bbox/BiasAdd"; + auto output1 = "output_cov/Sigmoid"; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0), + "requesting output 0 for the request"); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output1), + "requesting output 1 for the request"); + + // Load the input data + std::vector input0_data; + std::vector result_indexes; + std::vector bboxes_list; + + LoadInputData(scaled_input_image, &input0_data, input0_shape); + + size_t input0_size = input0_data.size(); + + const void* input0_base = &input0_data[0]; + +#ifdef TRITON_ENABLE_GPU + std::unique_ptr input0_gpu( + nullptr, cuda_data_deleter); + bool use_cuda_memory = + (enforce_memory_type && + (requested_memory_type != TRITONSERVER_MEMORY_CPU)); + if (use_cuda_memory) { + FAIL_IF_CUDA_ERR(cudaSetDevice(0), "setting CUDA device to device 0"); + if (requested_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) { + void* dst; + FAIL_IF_CUDA_ERR( + cudaMalloc(&dst, input0_size), + "allocating GPU memory for INPUT0 data"); + input0_gpu.reset(dst); + FAIL_IF_CUDA_ERR( + cudaMemcpy(dst, &input0_data[0], input0_size, cudaMemcpyHostToDevice), + "setting INPUT0 data in GPU memory"); + } else { + void* dst; + FAIL_IF_CUDA_ERR( + cudaHostAlloc(&dst, input0_size, cudaHostAllocPortable), + "allocating pinned memory for INPUT0 data"); + input0_gpu.reset(dst); + FAIL_IF_CUDA_ERR( + cudaMemcpy(dst, &input0_data[0], input0_size, cudaMemcpyHostToHost), + "setting INPUT0 data in pinned memory"); + } + } + + input0_base = use_cuda_memory ? input0_gpu.get() : &input0_data[0]; +#endif // TRITON_ENABLE_GPU + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAppendInputData( + irequest, input0, input0_base, input0_size, requested_memory_type, 0), + "assigning INPUT0 data"); + + // Perform inference... + { + auto p = new std::promise(); + std::future completed = p->get_future(); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, allocator, nullptr, InferResponseComplete, + reinterpret_cast(p)), + "setting response callback"); + + FAIL_IF_ERR( + TRITONSERVER_ServerInferAsync(server.get(), irequest, nullptr), + "running inference"); + + // Wait for the inference to complete. 
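+    // The response callback (InferResponseComplete, registered above) fulfills
+    // this promise once Triton finishes the request, so completed.get() blocks
+    // the worker thread until the response is available.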
+ TRITONSERVER_InferenceResponse* completed_response = completed.get(); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseError(completed_response), + "response status"); + + std::unique_lock lock(mutex); + + // Process output + DetectionInferenceOutput( + result_indexes, bboxes_list, completed_response, output0, output1, + input0_shape, fixHeight, ratio, sideCache, thread_id, visualize); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseDelete(completed_response), + "deleting inference response"); + } + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestDelete(irequest), + "deleting inference request"); +} + + +void +PrintModelStats( + std::shared_ptr server, const std::string model_name) +{ + TRITONSERVER_Message* model_stats_message = nullptr; + + FAIL_IF_ERR( + TRITONSERVER_ServerModelStatistics( + server.get(), model_name.c_str(), -1 /* model_version */, + &model_stats_message), + "unable to get model stats message"); + const char* buffer; + size_t byte_size; + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + model_stats_message, &buffer, &byte_size), + "unable to serialize server metadata message"); + + std::cout << "Model '" << model_name << "' Stats:" << std::endl; + std::cout << std::string(buffer, byte_size) << std::endl; + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(model_stats_message), + "deleting model stats message"); +} + + +void +CreateAndRunTritonserverInstance( + std::string model_repository_path, std::string tritonserver_path, + bool verbose_level, int thread_count, bool visualize) +{ + TRITONSERVER_ServerOptions* server_options = nullptr; + + SetServerOptions( + &server_options, verbose_level, model_repository_path, tritonserver_path); + + TRITONSERVER_Server* server_ptr = nullptr; + + FAIL_IF_ERR( + TRITONSERVER_ServerNew(&server_ptr, server_options), + "creating server instance. "); + + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsDelete(server_options), + "deleting server options"); + + std::shared_ptr server( + server_ptr, TRITONSERVER_ServerDelete); + + // Wait and until the server is both live and ready. + CheckServerLiveAndReady(server); + + // Print status of the servers. + PrintServerStatus(server); + std::string model = "peoplenet"; + + // Load models in server. + FAIL_IF_ERR( + TRITONSERVER_ServerLoadModel(server.get(), model.c_str()), + "failed to load model peoplenet"); + + // Wait for the models to become available. + AwaitModelReady(server, model.c_str()); + + // Create the allocator that will be used to allocate buffers for + // the result tensors. 
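+  // Triton calls ResponseAlloc()/ResponseRelease() (defined above) for every
+  // output tensor of every response, honoring the memory type requested via -m.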
+ TRITONSERVER_ResponseAllocator* allocator = nullptr; + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorNew( + &allocator, ResponseAlloc, ResponseRelease, nullptr /* start_fn */), + "creating response allocator"); + + + // Measure total execution time + using std::chrono::duration; + using std::chrono::duration_cast; + using std::chrono::high_resolution_clock; + using std::chrono::milliseconds; + + cv::Mat scaled_input_image; + bool fixHeight; + float ratio; + int sideCache; + std::vector input0_shape({1, 3, 544, 960}); + + // the input image is loaded only once and used for all inferences + LoadInputImageFromFile( + scaled_input_image, input0_shape, fixHeight, ratio, sideCache); + + auto t1 = high_resolution_clock::now(); + + // Multi-thread inference + std::thread inferences[thread_count]; + for (size_t i = 0; i < thread_count; i++) { + inferences[i] = std::thread( + &RunInferenceAndValidate, server, allocator, scaled_input_image, + fixHeight, ratio, sideCache, model.c_str(), i, visualize); + } + + for (int i = 0; i < thread_count; ++i) { + inferences[i].join(); + } + + // Second time point to measure elapsed time + auto t2 = high_resolution_clock::now(); + + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorDelete(allocator), + "deleting response allocator"); + + // Print Model Statistics for all models + PrintModelStats(server, model.c_str()); + + // Unload models in the servers. + FAIL_IF_ERR( + TRITONSERVER_ServerUnloadModel(server.get(), model.c_str()), + "failed to unload model"); + + /* Getting number of milliseconds as an integer. */ + auto ms_int = duration_cast(t2 - t1); + + std::cout << "\n TOTAL INFERENCE TIME: " << ms_int.count() << "ms\n"; +} + + +int +main(int argc, char** argv) +{ + std::string model_repository_path; + std::string tritonserver_path; + int verbose_level = 0; + int thread_count = 2; + bool visualize = false; + + // Parse commandline... + int opt; + while ((opt = getopt(argc, argv, "vm:r:p:t:s:")) != -1) { + switch (opt) { + case 'm': { + enforce_memory_type = true; + if (!strcmp(optarg, "system")) { + requested_memory_type = TRITONSERVER_MEMORY_CPU; + } else if (!strcmp(optarg, "pinned")) { + requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED; + } else if (!strcmp(optarg, "gpu")) { + requested_memory_type = TRITONSERVER_MEMORY_GPU; + } else { + Usage( + argv, + "-m must be used to specify one of the following types:" + " <\"system\"|\"pinned\"|gpu>"); + } + break; + } + case 'r': + model_repository_path = optarg; + break; + case 'p': + tritonserver_path = optarg; + break; + case 'v': + verbose_level = 1; + break; + case 't': + thread_count = std::stoi(optarg); + break; + case 's': + if (!strcmp(optarg, "true")) { + visualize = true; + } else if (!strcmp(optarg, "false")) { + visualize = false; + } else { + Usage( + argv, + "-s must be:" + " "); + } + break; + case '?': + Usage(argv); + break; + } + } + + if (thread_count < 1) { + Usage(argv, "thread_count must be >= 1"); + } + + if (model_repository_path.empty()) { + Usage(argv, "-r must be used to specify model repository path"); + } +#ifndef TRITON_ENABLE_GPU + if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) { + Usage(argv, "-m can only be set to \"system\" without enabling GPU"); + } +#endif // TRITON_ENABLE_GPU + + // Check API version. 
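+  // The library's major API version must match the one this file was compiled
+  // against, and its minor version must be at least as new.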
+ uint32_t api_version_major, api_version_minor; + FAIL_IF_ERR( + TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor), + "getting Triton API version"); + if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major) || + (TRITONSERVER_API_VERSION_MINOR > api_version_minor)) { + FAIL("triton server API version mismatch"); + } + + CreateAndRunTritonserverInstance( + model_repository_path, tritonserver_path, verbose_level, thread_count, + visualize); + + return 0; +} diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh b/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh new file mode 100755 index 0000000000..5c69680eee --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
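+
+# Rough meaning of the flags below (run `./tao-converter -h` for the full list):
+#   -k encryption key of the .etlt model, -d input dimensions (CxHxW),
+#   -i input order, -t engine precision, -b/-m batch size settings,
+#   -o output tensor names, -e where the serialized TensorRT engine is written.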
+ +./tao-converter \ + -k tlt_encode \ + -d 3,544,960 \ + -i nchw \ + -t fp16 \ + -b 16 \ + -m 64 \ + -o output_cov/Sigmoid,output_bbox/BiasAdd \ + -e ../trtis_model_repo_sample_1/peoplenet/1/model.plan \ + models/peoplenet/resnet34_peoplenet_pruned.etlt + +cp ../trtis_model_repo_sample_1/peoplenet/1/model.plan ../trtis_model_repo_sample_2/peoplenet/1/model.plan + diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/empty_config/config.pbtxt b/docs/examples/jetson/concurrency_and_dynamic_batching/tao/models/peoplenet/.gitkeep similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/empty_config/config.pbtxt rename to docs/examples/jetson/concurrency_and_dynamic_batching/tao/models/peoplenet/.gitkeep diff --git a/src/servables/tensorrt/testdata/autofill_sanity/empty_config/config.pbtxt b/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_1/peoplenet/1/.gitkeep similarity index 100% rename from src/servables/tensorrt/testdata/autofill_sanity/empty_config/config.pbtxt rename to docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_1/peoplenet/1/.gitkeep diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_1/peoplenet/config.pbtxt b/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_1/peoplenet/config.pbtxt new file mode 100644 index 0000000000..75532dee5f --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_1/peoplenet/config.pbtxt @@ -0,0 +1,28 @@ +name: "peoplenet" +platform: "tensorrt_plan" +max_batch_size: 64 +input [ + { + name: "input_1" + data_type: TYPE_FP32 + dims: [ 3, 544, 960 ] + } +] +output [ + { + name: "output_bbox/BiasAdd" + data_type: TYPE_FP32 + dims: [ 12, 34, 60 ] + }, + { + name: "output_cov/Sigmoid" + data_type: TYPE_FP32 + dims: [ 3, 34, 60 ] + } +] +instance_group [ + { + count: 3 + kind: KIND_GPU + } +] diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_2/peoplenet/1/.gitkeep b/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_2/peoplenet/1/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_2/peoplenet/config.pbtxt b/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_2/peoplenet/config.pbtxt new file mode 100644 index 0000000000..9a03913963 --- /dev/null +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/trtis_model_repo_sample_2/peoplenet/config.pbtxt @@ -0,0 +1,25 @@ +name: "peoplenet" +platform: "tensorrt_plan" +max_batch_size: 64 +input [ + { + name: "input_1" + data_type: TYPE_FP32 + dims: [ 3, 544, 960 ] + } +] +output [ + { + name: "output_bbox/BiasAdd" + data_type: TYPE_FP32 + dims: [ 12, 34, 60 ] + }, + { + name: "output_cov/Sigmoid" + data_type: TYPE_FP32 + dims: [ 3, 34, 60 ] + } +] +dynamic_batching { +} + diff --git a/docs/examples/model_repository/densenet_onnx/config.pbtxt b/docs/examples/model_repository/densenet_onnx/config.pbtxt new file mode 100644 index 0000000000..62f84d1bd9 --- /dev/null +++ b/docs/examples/model_repository/densenet_onnx/config.pbtxt @@ -0,0 +1,21 @@ +name: "densenet_onnx" +platform: "onnxruntime_onnx" +max_batch_size : 0 +input [ + { + name: "data_0" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 3, 224, 224 ] + reshape { shape: [ 1, 3, 224, 224 ] } + } +] +output [ + { + name: "fc6_1" + data_type: 
TYPE_FP32 + dims: [ 1000 ] + reshape { shape: [ 1, 1000, 1, 1 ] } + label_filename: "densenet_labels.txt" + } +] \ No newline at end of file diff --git a/docs/examples/model_repository/resnet50_netdef/resnet50_labels.txt b/docs/examples/model_repository/densenet_onnx/densenet_labels.txt similarity index 100% rename from docs/examples/model_repository/resnet50_netdef/resnet50_labels.txt rename to docs/examples/model_repository/densenet_onnx/densenet_labels.txt diff --git a/docs/examples/model_repository/inception_graphdef/config.pbtxt b/docs/examples/model_repository/inception_graphdef/config.pbtxt index e7a01bcb20..1636d56f77 100644 --- a/docs/examples/model_repository/inception_graphdef/config.pbtxt +++ b/docs/examples/model_repository/inception_graphdef/config.pbtxt @@ -17,9 +17,3 @@ output [ label_filename: "inception_labels.txt" } ] -instance_group [ - { - kind: KIND_GPU, - count: 4 - } -] diff --git a/docs/examples/model_repository/resnet50_netdef/config.pbtxt b/docs/examples/model_repository/resnet50_netdef/config.pbtxt deleted file mode 100644 index 5935b6293e..0000000000 --- a/docs/examples/model_repository/resnet50_netdef/config.pbtxt +++ /dev/null @@ -1,25 +0,0 @@ -name: "resnet50_netdef" -platform: "caffe2_netdef" -max_batch_size: 128 -input [ - { - name: "gpu_0/data" - data_type: TYPE_FP32 - format: FORMAT_NCHW - dims: [ 3, 224, 224 ] - } -] -output [ - { - name: "gpu_0/softmax" - data_type: TYPE_FP32 - dims: [ 1000 ] - label_filename: "resnet50_labels.txt" - } -] -instance_group [ - { - kind: KIND_GPU, - count: 4 - } -] diff --git a/docs/examples/model_repository/simple_dyna_sequence/1/model.graphdef b/docs/examples/model_repository/simple_dyna_sequence/1/model.graphdef new file mode 100755 index 0000000000..7dbacf70b4 Binary files /dev/null and b/docs/examples/model_repository/simple_dyna_sequence/1/model.graphdef differ diff --git a/docs/examples/model_repository/simple_dyna_sequence/config.pbtxt b/docs/examples/model_repository/simple_dyna_sequence/config.pbtxt new file mode 100644 index 0000000000..47889f1f7c --- /dev/null +++ b/docs/examples/model_repository/simple_dyna_sequence/config.pbtxt @@ -0,0 +1,101 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "simple_dyna_sequence" +platform: "tensorflow_graphdef" +max_batch_size: 8 +sequence_batching { + max_sequence_idle_microseconds: 10000000 + oldest { + max_candidate_sequences: 1024 + max_queue_delay_microseconds: 10000 + } + + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "CORRID" + control [ + { + kind: CONTROL_SEQUENCE_CORRID + data_type: TYPE_UINT64 + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +parameters [ + { + key: "execute_delay_ms" + value: { string_value: "3" } + } +] +instance_group [ + { + count: 2 + kind: KIND_CPU + } +] diff --git a/docs/examples/model_repository/simple_identity/1/model.savedmodel/saved_model.pb b/docs/examples/model_repository/simple_identity/1/model.savedmodel/saved_model.pb new file mode 100755 index 0000000000..63f78fecb4 Binary files /dev/null and b/docs/examples/model_repository/simple_identity/1/model.savedmodel/saved_model.pb differ diff --git a/docs/examples/model_repository/simple_identity/config.pbtxt b/docs/examples/model_repository/simple_identity/config.pbtxt new file mode 100644 index 0000000000..fa7baee9c6 --- /dev/null +++ b/docs/examples/model_repository/simple_identity/config.pbtxt @@ -0,0 +1,19 @@ + +name: "simple_identity" +platform: "tensorflow_savedmodel" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] diff --git a/docs/examples/model_repository/simple_int8/1/model.graphdef b/docs/examples/model_repository/simple_int8/1/model.graphdef new file mode 100755 index 0000000000..65cbc0dcf4 --- /dev/null +++ b/docs/examples/model_repository/simple_int8/1/model.graphdef @@ -0,0 +1,21 @@ + +@ +INPUT0 Placeholder* +dtype0* +shape: ÿÿÿÿÿÿÿÿÿ +@ +INPUT1 Placeholder* +dtype0* +shape: ÿÿÿÿÿÿÿÿÿ +# +ADDAddINPUT0INPUT1* +T0 +# +SUBSubINPUT0INPUT1* +T0 +! +OUTPUT0IdentityADD* +T0 +! 
+OUTPUT1IdentitySUB* +T0"† \ No newline at end of file diff --git a/docs/examples/model_repository/simple_int8/config.pbtxt b/docs/examples/model_repository/simple_int8/config.pbtxt new file mode 100644 index 0000000000..47e3324456 --- /dev/null +++ b/docs/examples/model_repository/simple_int8/config.pbtxt @@ -0,0 +1,27 @@ +name: "simple_int8" +platform: "tensorflow_graphdef" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/docs/examples/model_repository/simple_sequence/1/model.graphdef b/docs/examples/model_repository/simple_sequence/1/model.graphdef new file mode 100755 index 0000000000..d4c4bd6031 Binary files /dev/null and b/docs/examples/model_repository/simple_sequence/1/model.graphdef differ diff --git a/docs/examples/model_repository/simple_sequence/config.pbtxt b/docs/examples/model_repository/simple_sequence/config.pbtxt new file mode 100644 index 0000000000..1dd5c0da7c --- /dev/null +++ b/docs/examples/model_repository/simple_sequence/config.pbtxt @@ -0,0 +1,70 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "simple_sequence" +platform: "tensorflow_graphdef" +max_batch_size: 8 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/docs/examples/model_repository/simple_string/1/model.graphdef b/docs/examples/model_repository/simple_string/1/model.graphdef new file mode 100644 index 0000000000..d2d3db9180 Binary files /dev/null and b/docs/examples/model_repository/simple_string/1/model.graphdef differ diff --git a/docs/examples/model_repository/simple_string/config.pbtxt b/docs/examples/model_repository/simple_string/config.pbtxt new file mode 100644 index 0000000000..b01cd039b0 --- /dev/null +++ b/docs/examples/model_repository/simple_string/config.pbtxt @@ -0,0 +1,28 @@ + +name: "simple_string" +platform: "tensorflow_graphdef" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_STRING + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_STRING + dims: [ 16 ] + } +] diff --git a/docs/generate_docs.py b/docs/generate_docs.py new file mode 100755 index 0000000000..cb7ed02d9f --- /dev/null +++ b/docs/generate_docs.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import logging +import os +import re +import subprocess +from collections import defaultdict +from functools import partial + +from conf import exclude_patterns + +# Global constants +server_abspath = os.environ.get("SERVER_ABSPATH", os.getcwd()) +server_docs_abspath = os.path.join(server_abspath, "docs") + +""" +TODO: Needs to handle cross-branch linkage. + +For example, server/docs/user_guide/architecture.md on branch 24.09 links to +server/docs/user_guide/model_analyzer.md on main branch. In this case, the +hyperlink of model_analyzer.md should be a URL instead of relative path. + +Another example can be server/docs/user_guide/model_analyzer.md on branch 24.09 +links to a file in server repo with relative path. Currently all URLs are +hardcoded to main branch. We need to make sure that the URL actually points to the +correct branch. We also need to handle cases like deprecated or removed files from +older branch to avoid 404 error code. +""" +# Regex patterns +http_patn = r"^https?://" +http_reg = re.compile(http_patn) +tag_patn = "/(?:blob|tree)/main" +triton_repo_patn = rf"{http_patn}github.com/triton-inference-server" +triton_github_url_reg = re.compile( + rf"{triton_repo_patn}/([^/#]+)(?:{tag_patn})?/*([^#]*)\s*(?=#|$)" +) +# relpath_patn = r"]\s*\(\s*([^)]+)\)" +# Hyperlink in a .md file, excluding embedded images. +hyperlink_reg = re.compile(r"((?". + + Examples: + https://github.com/triton-inference-server/server/blob/main/docs/protocol#restricted-protocols + https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md + https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher + + Keep URL in the following cases: + https://github.com/triton-inference-server/server/tree/r24.02 + https://github.com/triton-inference-server/server/blob/main/build.py + https://github.com/triton-inference-server/server/blob/main/qa + https://github.com/triton-inference-server/server/blob/main/CONTRIBUTING.md + """ + m = triton_github_url_reg.match(url) + # Do not replace URL if it is not a Triton GitHub file. + if not m: + return url + + target_repo_name = m.group(1) + target_relpath_from_target_repo = os.path.normpath(m.groups("")[1]) + section = url[len(m.group(0)) :] + valid_hashtag = section not in ["", "#"] and section.startswith("#") + + if target_repo_name == "server": + target_path = os.path.join(server_abspath, target_relpath_from_target_repo) + else: + target_path = os.path.join( + server_docs_abspath, target_repo_name, target_relpath_from_target_repo + ) + + # Return URL if it points to a path outside server/docs. + if os.path.commonpath([server_docs_abspath, target_path]) != server_docs_abspath: + return url + + if ( + os.path.isfile(target_path) + and os.path.splitext(target_path)[1] == ".md" + and not is_excluded(target_path) + ): + pass + elif ( + os.path.isdir(target_path) + and os.path.isfile(os.path.join(target_path, "README.md")) + and valid_hashtag + and not is_excluded(os.path.join(target_path, "README.md")) + ): + target_path = os.path.join(target_path, "README.md") + else: + return url + + # The "target_path" must be a file at this line. + relpath = os.path.relpath(target_path, start=os.path.dirname(src_doc_path)) + return re.sub(triton_github_url_reg, relpath, url, 1) + + +def replace_relpath_with_url(relpath, src_doc_path): + """ + This function replaces relative paths with Triton Inference Server GitHub URLs in following cases. + 1. 
Relative path is a file that is not ".md" type inside the current repo. + 2. Relative path is a directory but not (has "README.md" and ends with "#<section>
"). + 3. Relative path does not exist (shows 404 page). + + Examples: + ../examples/model_repository + ../examples/model_repository/inception_graphdef/config.pbtxt + + Keep relpath in the following cases: + build.md + build.md#building-with-docker + #building-with-docker + ../getting_started/quickstart.md + ../protocol#restricted-protocols + """ + target_path = relpath.rsplit("#")[0] + section = relpath[len(target_path) :] + valid_hashtag = section not in ["", "#"] + if relpath.startswith("#"): + target_path = os.path.basename(src_doc_path) + target_path = os.path.join(os.path.dirname(src_doc_path), target_path) + target_path = os.path.normpath(target_path) + src_git_repo_name = get_git_repo_name(src_doc_path) + + url = f"https://github.com/triton-inference-server/{src_git_repo_name}/blob/main/" + if src_git_repo_name == "server": + src_repo_abspath = server_abspath + # TODO: Assert the relative path not pointing to cloned repo, e.g. client. + # This requires more information which may be stored in a global variable. + else: + src_repo_abspath = os.path.join(server_docs_abspath, src_git_repo_name) + + # Assert target path is under the current repo directory. + assert os.path.commonpath([src_repo_abspath, target_path]) == src_repo_abspath + + target_path_from_src_repo = os.path.relpath(target_path, start=src_repo_abspath) + + # For example, target_path of "../protocol#restricted-protocols" should be "/server/docs/protocol/README.md" + if ( + os.path.isdir(target_path) + and valid_hashtag + and os.path.isfile(os.path.join(target_path, "README.md")) + ): + relpath = os.path.join(relpath.rsplit("#")[0], "README.md") + section + target_path = os.path.join(target_path, "README.md") + + if ( + os.path.isfile(target_path) + and os.path.splitext(target_path)[1] == ".md" + and os.path.commonpath([server_docs_abspath, target_path]) + == server_docs_abspath + and not is_excluded(target_path) + ): + return relpath + else: + return url + target_path_from_src_repo + section + + +def replace_hyperlink(m, src_doc_path): + """ + TODO: Support of HTML tags for future docs. + Markdown allows , e.g. ]+>. Whether we want to + find and replace the link depends on if they link to internal .md files + or allows relative paths. I haven't seen one such case in our doc so + should be safe for now. + """ + + hyperlink_str = m.group(2) + match = http_reg.match(hyperlink_str) + + if match: + # Hyperlink is a URL. + res = replace_url_with_relpath(hyperlink_str, src_doc_path) + else: + # Hyperlink is a relative path. + res = replace_relpath_with_url(hyperlink_str, src_doc_path) + + return m.group(1) + res + m.group(3) + + +def preprocess_docs(exclude_paths=[]): + # Find all ".md" files inside the current repo. + if exclude_paths: + cmd = ( + ["find", server_docs_abspath, "-type", "d", "\\("] + + " -o ".join([f"-path './{dir}'" for dir in exclude_paths]).split(" ") + + ["\\)", "-prune", "-o", "-type", "f", "-name", "'*.md'", "-print"] + ) + else: + cmd = ["find", server_docs_abspath, "-name", "'*.md'"] + cmd = " ".join(cmd) + result = subprocess.run(cmd, check=True, capture_output=True, text=True, shell=True) + docs_list = list(filter(None, result.stdout.split("\n"))) + + # Read, preprocess and write back to each document file. 
+ for doc_abspath in docs_list: + if is_excluded(doc_abspath): + continue + + content = None + with open(doc_abspath, "r") as f: + content = f.read() + + content = hyperlink_reg.sub( + partial(replace_hyperlink, src_doc_path=doc_abspath), + content, + ) + + with open(doc_abspath, "w") as f: + f.write(content) + + +def main(): + args = parser.parse_args() + repo_tags = parse_repo_tag(args.repo_tag) if args.repo_tag else {} + backend_tags = parse_repo_tag(args.backend) if args.backend else {} + github_org = args.github_organization + + # Change working directory to server/docs. + os.chdir(server_docs_abspath) + run_command("make clean") + + # Usage generate_docs.py --repo-tag=client:main + if "client" in repo_tags: + clone_from_github("client", repo_tags["client"], github_org) + + # Usage generate_docs.py --repo-tag=perf_analyzer:main + if "perf_analyzer" in repo_tags: + clone_from_github("perf_analyzer", repo_tags["perf_analyzer"], github_org) + + # Usage generate_docs.py --repo-tag=python_backend:main + if "python_backend" in repo_tags: + clone_from_github("python_backend", repo_tags["python_backend"], github_org) + + # Usage generate_docs.py --repo-tag=tensorrtllm_backend:main + if "tensorrtllm_backend" in repo_tags: + clone_from_github( + "tensorrtllm_backend", repo_tags["tensorrtllm_backend"], github_org + ) + + # Usage generate_docs.py --backend-tag=custom_backend:main + # Custom backend can be anything currently empty + if "custom_backend" in backend_tags: + clone_from_github("custom_backend", backend_tags["custom_backend"], github_org) + + # Preprocess documents in server_docs_abspath after all repos are cloned. + preprocess_docs() + run_command("make html") + + # Clean up working directory. + if "client" in repo_tags: + run_command("rm -rf client") + if "python_backend" in repo_tags: + run_command("rm -rf python_backend") + if "custom_backend" in backend_tags: + run_command("rm -rf custom_backend") + if "tensorrtllm_backend" in repo_tags: + run_command("rm -rf tensorrtllm_backend") + if "perf_analyzer" in repo_tags: + run_command("rm -rf perf_analyzer") + + # Return to previous working directory server/. + os.chdir(server_abspath) + + +if __name__ == "__main__": + main() diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md new file mode 100644 index 0000000000..1d475e771e --- /dev/null +++ b/docs/getting_started/quickstart.md @@ -0,0 +1,164 @@ + + +# Quickstart + +**New to Triton Inference Server and want do just deploy your model quickly?** +Make use of +[these tutorials](https://github.com/triton-inference-server/tutorials#quick-deploy) + to begin your Triton journey! + +The Triton Inference Server is available as [buildable source + code](../customization_guide/build.md), but the easiest way to install and run Triton is to + use the pre-built Docker image available from the [NVIDIA GPU + Cloud (NGC)](https://ngc.nvidia.com). + +Launching and maintaining Triton Inference Server revolves around the use of building model repositories. This tutorial will cover: + +* Creating a Model Repository +* Launching Triton +* Send an Inference Request + +## Create A Model Repository + +The [model repository](../user_guide/model_repository.md) is the directory where you +place the models that you want Triton to serve. An example model +repository is included in the +[docs/examples/model_repository](../examples/model_repository). +Before using the repository, you must fetch any missing model definition +files from their public model zoos via the provided script. 
+ +``` +$ cd docs/examples +$ ./fetch_models.sh +``` + +## Launch Triton + +Triton is optimized to provide the best inferencing performance by +using GPUs, but it can also work on CPU-only systems. In both cases +you can use the same Triton Docker image. + +### Run on System with GPUs + +Use the following command to run Triton with the example model +repository you just created. The [NVIDIA Container +Toolkit](https://github.com/NVIDIA/nvidia-docker) must be installed +for Docker to recognize the GPU(s). The --gpus=1 flag indicates that 1 +system GPU should be made available to Triton for inferencing. + +``` +$ docker run --gpus=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v/full/path/to/docs/examples/model_repository:/models nvcr.io/nvidia/tritonserver:-py3 tritonserver --model-repository=/models +``` + +Where \ is the version of Triton that you want to use (and +pulled above). After you start Triton you will see output on the +console showing the server starting up and loading the model. When you +see output like the following, Triton is ready to accept inference +requests. + +``` ++----------------------+---------+--------+ +| Model | Version | Status | ++----------------------+---------+--------+ +| | | READY | +| .. | . | .. | +| .. | . | .. | ++----------------------+---------+--------+ +... +... +... +I1002 21:58:57.891440 62 grpc_server.cc:3914] Started GRPCInferenceService at 0.0.0.0:8001 +I1002 21:58:57.893177 62 http_server.cc:2717] Started HTTPService at 0.0.0.0:8000 +I1002 21:58:57.935518 62 http_server.cc:2736] Started Metrics Service at 0.0.0.0:8002 +``` +All the models should show "READY" status to indicate that they loaded correctly. If a model fails to load the status will report the failure and a reason for the failure. If your model is not displayed in the table check the path to the model repository and your CUDA drivers. + +### Run on CPU-Only System + +On a system without GPUs, Triton should be run without using the +--gpus flag to Docker, but is otherwise identical to what is described +above. + +``` +$ docker run --rm -p8000:8000 -p8001:8001 -p8002:8002 -v/full/path/to/docs/examples/model_repository:/models nvcr.io/nvidia/tritonserver:-py3 tritonserver --model-repository=/models +``` + +Because the --gpus flag is not used, a GPU is not available and Triton +will therefore be unable to load any model configuration that requires +a GPU. + +### Verify Triton Is Running Correctly + +Use Triton’s *ready* endpoint to verify that the server and the models +are ready for inference. From the host system use curl to access the +HTTP endpoint that indicates server status. + +``` +$ curl -v localhost:8000/v2/health/ready +... +< HTTP/1.1 200 OK +< Content-Length: 0 +< Content-Type: text/plain +``` + +The HTTP request returns status 200 if Triton is ready and non-200 if +it is not ready. + +## Send an Inference Request + +Use docker pull to get the client libraries and examples image +from NGC. + +``` +$ docker pull nvcr.io/nvidia/tritonserver:-py3-sdk +``` + +Where \ is the version that you want to pull. Run the client +image. + +``` +$ docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:-py3-sdk +``` + +From within the nvcr.io/nvidia/tritonserver:-py3-sdk +image, run the example image-client application to perform image +classification using the example densenet_onnx model. + +To send a request for the densenet_onnx model use an image from the +/workspace/images directory. In this case we ask for the top 3 +classifications. 
+ +``` +$ /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg +Request 0, batch size 1 +Image '/workspace/images/mug.jpg': + 15.346230 (504) = COFFEE MUG + 13.224326 (968) = CUP + 10.422965 (505) = COFFEEPOT +``` diff --git a/docs/getting_started/trtllm_user_guide.md b/docs/getting_started/trtllm_user_guide.md new file mode 100644 index 0000000000..7f128e98c7 --- /dev/null +++ b/docs/getting_started/trtllm_user_guide.md @@ -0,0 +1,118 @@ + + +# TensorRT-LLM User Guide + +## What is TensorRT-LLM + +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +(TRT-LLM) is an open-source library designed to accelerate and optimize the +inference performance of large language models (LLMs) on NVIDIA GPUs. TRT-LLM +offers users an easy-to-use Python API to build TensorRT engines for LLMs, +incorporating state-of-the-art optimizations to ensure efficient inference on +NVIDIA GPUs. + +## How to run TRT-LLM models with Triton Server via TensorRT-LLM backend + +The +[TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend) +lets you serve TensorRT-LLM models with Triton Inference Server. Check out the +[Getting Started](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#getting-started) +section in the TensorRT-LLM Backend repo to learn how to utilize the +[NGC Triton TRT-LLM container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +to prepare engines for your LLM models and serve them with Triton. + +## How to use your custom TRT-LLM model + +All the supported models can be found in the +[examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) folder in +the TRT-LLM repo. Follow the examples to convert your models to TensorRT +engines. + +After the engine is built, [prepare the model repository](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#prepare-the-model-repository) +for Triton, and +[modify the model configuration](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration). + +Only the *mandatory parameters* need to be set in the model config file. Feel free +to modify the optional parameters as needed. To learn more about the +parameters, model inputs, and outputs, see the +[model config documentation](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/docs/model_config.md). + +## Advanced Configuration Options and Deployment Strategies + +Explore advanced configuration options and deployment strategies to optimize +and run Triton with your TRT-LLM models effectively: + +- [Model Deployment](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#model-deployment): Techniques for efficiently deploying and managing your models in various environments. +- [Multi-Instance GPU (MIG) Support](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#mig-support): Run Triton and TRT-LLM models with MIG to optimize GPU resource management. +- [Scheduling](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#scheduling): Configure scheduling policies to control how requests are managed and executed. +- [Key-Value Cache](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#key-value-cache): Utilize KV cache and KV cache reuse to optimize memory usage and improve performance.
+- [Decoding](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#decoding): Advanced methods for generating text, including top-k, top-p, top-k top-p, beam search, Medusa, and speculative decoding. +- [Chunked Context](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#chunked-context): Splitting the context into several chunks and batching them during generation phase to increase overall throughput. +- [Quantization](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#quantization): Apply quantization techniques to reduce model size and enhance inference speed. +- [LoRa (Low-Rank Adaptation)](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main?tab=readme-ov-file#lora): Use LoRa for efficient model fine-tuning and adaptation. + +## Tutorials + +Make sure to check out the +[tutorials](https://github.com/triton-inference-server/tutorials) repo to see +more guides on serving popular LLM models with Triton Server and TensorRT-LLM, +as well as deploying them on Kubernetes. + +## Benchmark + +[GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) +is a command line tool for measuring the throughput and latency of LLMs served +by Triton Inference Server. Check out the +[Quick Start](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#quick-start) +to learn how to use GenAI-Perf to benchmark your LLM models. + +## Performance Best Practices + +Check out the +[Performance Best Practices guide](https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html) +to learn how to optimize your TensorRT-LLM models for better performance. + +## Metrics + +Triton Server provides +[metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md) +indicating GPU and request statistics. +See the +[Triton Metrics](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics) +section in the TensorRT-LLM Backend repo to learn how to query the Triton +metrics endpoint to obtain TRT-LLM statistics. + +## Ask questions or report issues + +Can't find what you're looking for, or have a question or issue? Feel free to +ask questions or report issues in the GitHub issues page: + +- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/issues) +- [TensorRT-LLM Backend](https://github.com/triton-inference-server/tensorrtllm_backend/issues) +- [Triton Inference Server](https://github.com/triton-inference-server/server/issues) diff --git a/docs/http_grpc_api.rst b/docs/http_grpc_api.rst deleted file mode 100644 index 1f5aba85ce..0000000000 --- a/docs/http_grpc_api.rst +++ /dev/null @@ -1,180 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. 
- # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -.. _section-inference-server-api: - -Inference Server API -==================== - -The TensorRT Inference Server exposes both HTTP and GRPC -endpoints. Three endpoints with identical functionality are exposed -for each protocol. - -* :ref:`section-api-health`: The server health API for determining - server liveness and readiness. - -* :ref:`section-api-status`: The server status API for getting - information about the server and about the models being served. - -* :ref:`section-api-inference`: The inference API that accepts model - inputs, runs inference and returns the requested outputs. - -The HTTP endpoints can be used directly as described in this section, -but for most use-cases, the preferred way to access TRTIS is via the -`C++ and Python Client libraries -`. - -The GRPC endpoints can also be used via the `C++ and Python Client -libraries ` or a GRPC-generated -API can be used directly as shown in the grpc_image_client.py example. - -.. _section-api-health: - -Health ------- - -Performing an HTTP GET to /api/health/live returns a 200 status if the -server is able to receive and process requests. Any other status code -indicates that the server is still initializing or has failed in some -way that prevents it from processing requests. - -Once the liveness endpoint indicates that the server is active, -performing an HTTP GET to /api/health/ready returns a 200 status if -the server is able to respond to inference requests for some or all -models (based on TRTIS's -\\-strict-readiness option explained -below). Any other status code indicates that the server is not ready -to respond to some or all inference requests. - -For GRPC the :cpp:var:`GRPCService -` uses the -:cpp:var:`HealthRequest ` and -:cpp:var:`HealthResponse ` -messages to implement the endpoint. - -By default, the readiness endpoint will return success if the server -is responsive and all models loaded successfully. Thus, by default, -success indicates that an inference request for any model can be -handled by the server. For some use cases, you want the readiness -endpoint to return success even if all models are not available. In -this case, use the -\\-strict-readiness=false option to cause the -readiness endpoint to report success as long as the server is -responsive (even if one or more models are not available). - -.. _section-api-status: - -Status ------- - -Performing an HTTP GET to /api/status returns status information about -the server and all the models being served. 
Performing an HTTP GET to -/api/status/ returns information about the server and the -single model specified by . The server status is returned -in the HTTP response body in either text format (the default) or in -binary format if query parameter format=binary is specified (for -example, /api/status?format=binary). The success or failure of the -status request is indicated in the HTTP response code and the -**NV-Status** response header. The **NV-Status** response header -returns a text protobuf formatted :cpp:var:`RequestStatus -` message. - -For GRPC the :cpp:var:`GRPCService -` uses the -:cpp:var:`StatusRequest ` and -:cpp:var:`StatusResponse ` -messages to implement the endpoint. The response includes a -:cpp:var:`RequestStatus ` -message indicating success or failure. - -For either protocol the status itself is returned as a -:cpp:var:`ServerStatus ` -message. - -.. _section-api-inference: - -Inference ---------- - -Performing an HTTP POST to /api/infer/ performs inference -using the latest version of the model that is being made available by -the model's :ref:`version policy `. The latest -version is the numerically greatest version number. Performing an HTTP -POST to /api/infer// performs inference -using a specific version of the model. - -The request uses the **NV-InferRequest** header to communicate an -:cpp:var:`InferRequestHeader -` message that describes -the input tensors and the requested output tensors. For example, for a -resnet50 model the following **NV-InferRequest** header indicates that -a batch-size 1 request is being made with input size of 602112 bytes -(3 * 224 * 224 * sizeof(FP32)), and that the result of the tensor -named "output" should be returned as the top-3 classification values:: - - NV-InferRequest: batch_size: 1 input { name: "input" byte_size: 602112 } output { name: "output" byte_size: 4000 cls { count: 3 } } - -The input tensor values are communicated in the body of the HTTP POST -request as raw binary in the order as the inputs are listed in the -request header. - -The inference results are returned in the body of the HTTP response to -the POST request. For outputs where full result tensors were -requested, the result values are communicated in the body of the -response in the order as the outputs are listed in the request -header. After those, an :cpp:var:`InferResponseHeader -` message is appended to -the response body. The :cpp:var:`InferResponseHeader -` message is returned in -either text format (the default) or in binary format if query -parameter format=binary is specified (for example, -/api/infer/foo?format=binary). - -For example, assuming outputs specified in the -:cpp:var:`InferResponseHeader -` in order are -“output0â€, “output1â€, …, “outputnâ€, the response body would contain:: - - - - ... - - - -The success or failure of the inference request is indicated in the -HTTP response code and the **NV-Status** response header. The -**NV-Status** response header returns a text protobuf formatted -:cpp:var:`RequestStatus ` -message. - -For GRPC the :cpp:var:`GRPCService -` uses the -:cpp:var:`InferRequest ` and -:cpp:var:`InferResponse ` -messages to implement the endpoint. The response includes a -:cpp:var:`RequestStatus ` -message indicating success or failure, :cpp:var:`InferResponseHeader -` message giving -response meta-data, and the raw output tensors. 
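As a rough illustration of the legacy HTTP endpoints described in the removed document above, the liveness, readiness, and status routes could be exercised with curl. This is a sketch only: the host and port 8000 are assumptions (use whatever port your server exposes), `<model name>` is a placeholder, and these `/api/*` routes belong to the legacy pre-KServe protocol.

```
# Liveness: HTTP 200 once the server can receive and process requests.
$ curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/api/health/live

# Readiness: HTTP 200 when models are ready (subject to --strict-readiness).
$ curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/api/health/ready

# Status for the whole server, or for a single model; text protobuf by default,
# binary if format=binary is given.
$ curl localhost:8000/api/status
$ curl "localhost:8000/api/status/<model name>?format=binary"
```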
diff --git a/docs/images/arch.png b/docs/images/arch.png deleted file mode 100644 index 949d56ab3a..0000000000 Binary files a/docs/images/arch.png and /dev/null differ diff --git a/docs/images/cuda_stream_exec.png b/docs/images/cuda_stream_exec.png deleted file mode 100644 index f9f152c195..0000000000 Binary files a/docs/images/cuda_stream_exec.png and /dev/null differ diff --git a/docs/images/multi_model_exec.png b/docs/images/multi_model_exec.png deleted file mode 100644 index b77e3a1117..0000000000 Binary files a/docs/images/multi_model_exec.png and /dev/null differ diff --git a/docs/images/multi_model_parallel_exec.png b/docs/images/multi_model_parallel_exec.png deleted file mode 100644 index 53e6647195..0000000000 Binary files a/docs/images/multi_model_parallel_exec.png and /dev/null differ diff --git a/docs/images/multi_model_serial_exec.png b/docs/images/multi_model_serial_exec.png deleted file mode 100644 index 1b6532a920..0000000000 Binary files a/docs/images/multi_model_serial_exec.png and /dev/null differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000000..6d42750eaa --- /dev/null +++ b/docs/index.md @@ -0,0 +1,106 @@ + + +::::{grid} +:reverse: +:gutter: 2 1 1 1 +:margin: 4 4 1 1 + +:::{grid-item} +:columns: 4 + +```{image} ./_static/nvidia-logo-vert-rgb-blk-for-screen.png +:width: 300px +``` +::: +:::{grid-item} +:columns: 8 +:class: sd-fs-3 + +NVIDIA Triton Inference Server + +::: +:::: + +Triton Inference Server is an open source inference serving software that streamlines AI inferencing. + + + +
+ +
+ +# Triton Inference Server + +Triton Inference Server enables teams to deploy any AI model from multiple deep +learning and machine learning frameworks, including TensorRT, TensorFlow, +PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton supports inference +across cloud, data center, edge and embedded devices on NVIDIA GPUs, x86 and ARM +CPU, or AWS Inferentia. Triton Inference Server delivers optimized performance +for many query types, including real time, batched, ensembles and audio/video +streaming. Triton inference Server is part of +[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), +a software platform that accelerates the data science pipeline and streamlines +the development and deployment of production AI. + +Major features include: + +- [Supports multiple deep learning + frameworks](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton) +- [Supports multiple machine learning + frameworks](https://github.com/triton-inference-server/fil_backend) +- [Concurrent model + execution](user_guide/architecture.md#concurrent-model-execution) +- [Dynamic batching](user_guide/model_configuration.md#dynamic-batcher) +- [Sequence batching](user_guide/model_configuration.md#sequence-batcher) and + [implicit state management](user_guide/architecture.md#implicit-state-management) + for stateful models +- Provides [Backend API](https://github.com/triton-inference-server/backend) that + allows adding custom backends and pre/post processing operations +- Model pipelines using + [Ensembling](user_guide/architecture.md#ensemble-models) or [Business + Logic Scripting + (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) +- [HTTP/REST and GRPC inference + protocols](customization_guide/inference_protocols.md) based on the community + developed [KServe + protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2) +- A [C API](customization_guide/inference_protocols.md#in-process-triton-server-api) and + [Java API](customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api) + allow Triton to link directly into your application for edge and other in-process use cases +- [Metrics](user_guide/metrics.md) indicating GPU utilization, server + throughput, server latency, and more + +Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best +practices, and more. Need enterprise support? NVIDIA global support is available +for Triton Inference Server with the [NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). + +See the [Latest Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-23-05.html#rel-23-05) for updates on the newest features and bug fixes. diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index cb2e55c992..0000000000 --- a/docs/index.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. 
- # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -NVIDIA TensorRT Inference Server -================================ - -.. ifconfig:: "dev" in release - - .. warning:: - You are currently viewing unstable developer preview - of the documentation. To see the documentation for the latest - stable release click `here - `_. - -.. include:: ../README.rst - :start-after: overview-begin-marker-do-not-remove - :end-before: overview-end-marker-do-not-remove - -.. toctree:: - :hidden: - - Documentation home - -.. toctree:: - :maxdepth: 2 - :caption: User Guide - - quickstart - install - run - client - model_repository - model_configuration - http_grpc_api - metrics - -.. toctree:: - :maxdepth: 2 - :caption: Developer Guide - - architecture - contribute - build - test - -.. toctree:: - :maxdepth: 2 - :caption: API Reference - - protobuf_api/protobuf_api_root - cpp_api/cpp_api_root - python_api - - -Indices and tables -================== - -* :ref:`genindex` diff --git a/docs/install.rst b/docs/install.rst deleted file mode 100644 index ff7f56b497..0000000000 --- a/docs/install.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Installing the Server -===================== - -The TensorRT Inference Server is available as a pre-built Docker -container or you can :ref:`build it from source -`. - -.. _section-installing-prebuilt-containers: - -Installing Prebuilt Containers ------------------------------- - -The inference server is provided as a pre-built container on the -`NVIDIA GPU Cloud (NGC) `_. Before pulling the -container you must have access and be logged into the NGC container -registry as explained in the `NGC Getting Started Guide -`_. - -Before you can pull a container from the NGC container registry, you -must have Docker and nvidia-docker installed. For DGX users, this is -explained in `Preparing to use NVIDIA Containers Getting Started Guide -`_. -For users other than DGX, follow the `nvidia-docker installation -documentation `_ to install -the most recent version of CUDA, Docker, and nvidia-docker. - -After performing the above setup, you can pull the TensorRT Inference -Server container using the following command:: - - docker pull nvcr.io/nvidia/tensorrtserver:18.11-py3 - -Replace *18.11* with the version of TRTIS that you want to pull. diff --git a/docs/metrics.rst b/docs/metrics.rst deleted file mode 100644 index cc26f7e8ea..0000000000 --- a/docs/metrics.rst +++ /dev/null @@ -1,93 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -.. 
_section-metrics: - -Metrics -======= - -The TensorRT Inference server provides `Prometheus -`_ metrics indicating GPU and request -statistics. By default, these metrics are available at -http://localhost:8002/metrics. The TRTIS -\\-metrics-port option can -be used to select a different port. The following table describes the -available metrics. - -+--------------+----------------+---------------------------------------+-----------+-----------+ -|Category |Metric |Description |Granularity|Frequency | -| | | | | | -+==============+================+=======================================+===========+===========+ -|| GPU |Power Usage |GPU instantaneous power |Per GPU |Per second | -|| Utilization | | | | | -| | | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| |Power Limit |Maximum GPU power limit |Per GPU |Per second | -| | | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| || Energy || GPU energy consumption in joules |Per GPU |Per second | -| || Consumption || since the server started | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| |GPU Utilization || GPU utilization rate |Per GPU |Per second | -| | || (0.0 - 1.0) | | | -+--------------+----------------+---------------------------------------+-----------+-----------+ -|| GPU || GPU Total || Total GPU memory, in bytes |Per GPU |Per second | -|| Memory || Memory | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| || GPU Used || Used GPU memory, in bytes |Per GPU |Per second | -| || Memory | | | | -+--------------+----------------+---------------------------------------+-----------+-----------+ -|Count |Request Count || Number of inference requests |Per model |Per request| -| | | | | | -| | | | | | -| | | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| |Execution Count || Number of inference executions |Per model |Per request| -| | || (request count / execution count | | | -| | || = average dynamic batch size) | | | -| | | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| |Inference Count || Number of inferences performed |Per model |Per request| -| | || (one request counts as | | | -| | || "batch size" inferences) | | | -| | | | | | -+--------------+----------------+---------------------------------------+-----------+-----------+ -|Latency |Request Time || End-to-end inference request |Per model |Per request| -| | || handling time | | | -| | | | | | -| | | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| |Compute Time || Time a request spends executing |Per model |Per request| -| | || the inference model (in the | | | -| | || framework backend) | | | -| | | | | | -+ +----------------+---------------------------------------+-----------+-----------+ -| |Queue Time || Time a request spends waiting |Per model |Per request| -| | || in the queue | | | -| | | | | | -| | | | | | -| | | | | | -+--------------+----------------+---------------------------------------+-----------+-----------+ diff --git a/docs/model_configuration.rst b/docs/model_configuration.rst deleted file mode 100644 index d038788e12..0000000000 --- a/docs/model_configuration.rst +++ /dev/null @@ -1,305 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
- # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -.. _section-model-configuration: - -Model Configuration -=================== - -Each model in a :ref:`section-model-repository` must include a model -configuration that provides required and optional information about -the model. Typically, this configuration is provided in a config.pbtxt -file specified as :doc:`ModelConfig ` -protobuf. In some cases, discussed in -:ref:`section-generated-model-configuration`, the model configuration -can be generated automatically by TRTIS and so does not need to be -provided explicitly. - -A minimal model configuration must specify :cpp:var:`name -`, :cpp:var:`platform -`, -:cpp:var:`max_batch_size -`, -:cpp:var:`input `, and -:cpp:var:`output `. - -As a running example consider a TensorRT model called *mymodel* that -has two inputs, *input0* and *input1*, and one output, *output0*, all -of which are 16 entry float32 tensors. The minimal configuration is:: - - name: "mymodel" - platform: "tensorrt_plan" - max_batch_size: 8 - input [ - { - name: "input0" - data_type: TYPE_FP32 - dims: [ 16 ] - }, - { - name: "input1" - data_type: TYPE_FP32 - dims: [ 16 ] - } - ] - output [ - { - name: "output0" - data_type: TYPE_FP32 - dims: [ 16 ] - } - ] - -The name of the model must match the :cpp:var:`name -` of the model repository -directory containing the model. The :cpp:var:`platform -` must be one of -**tensorrt_plan**, **tensorflow_graphdef**, **tensorflow_savedmodel**, -or **caffe2_netdef**. - -For models that support batched inputs the :cpp:var:`max_batch_size -` value must be ->= 1. The TensorRT Inference Server assumes that the batching occurs -along a first dimension that is not listed in the inputs or -outputs. For the above example TRTIS expects to receive input tensors -with shape **[ x, 16 ]** and produces an output tensor with shape **[ -x, 16 ]**, where **x** is the batch size of the request. - -For models that do not support batched inputs the -:cpp:var:`max_batch_size -` value must be -zero. 
If the above example specified a :cpp:var:`max_batch_size -` of zero, TRTIS -would expect to receive input tensors with shape **[ 16 ]**, and would -produce an output tensor with shape **[ 16 ]**. - -.. _section-generated-model-configuration: - -Generated Model Configuration ------------------------------ - -By default, the model configuration file containing the required -settings must be provided with each model. However, if TRTIS is -started with the -\\-strict-model-config=false option, then in some -cases the required portions of the model configuration file can be -generated automatically by TRTIS. The required portion of the model -configuration are those settings shown in the example minimal -configuration above. Specifically: - -* :ref:`TensorRT Plan ` models do not require - a model configuration file because TRTIS can derive all the required - settings automatically. - -* Some :ref:`TensorFlow SavedModel ` models - do not require a model configuration file. The models must specify - all inputs and outputs as fixed-size tensors (with an optional - initial batch dimension) for the model configuration to be generated - automatically. The easiest way to determine if a particular - SavedModel is supported is to try it with TRTIS and check the - console log and :ref:`Status API ` to determine - if the model loaded successfully. - -When using -\\-strict-model-config=false you can see the model -configuration that was generated for a model by using the :ref:`Status -API `. - -The TensorRT Inference Server only generates the required portion of -the model configuration file. You must still provide the optional -portions of the model configuration if necessary, such as -:cpp:var:`version_policy -`, -:cpp:var:`optimization -`, -:cpp:var:`dynamic_batching -`, -:cpp:var:`instance_group -`, -:cpp:var:`default_model_filename -`, -:cpp:var:`cc_model_filenames -`, and -:cpp:var:`tags `. - -.. _section-version-policy: - -Version Policy --------------- - -Each model can have one or more :ref:`versions available in the model -repository `. The -:cpp:var:`nvidia::inferenceserver::ModelVersionPolicy` schema allows -the following policies. - -* :cpp:var:`All - `: All versions - of the model that are available in the model repository are - available for inferencing. - -* :cpp:var:`Latest - `: Only the - latest ‘n’ versions of the model in the repository are available for - inferencing. The latest versions of the model are the numerically - greatest version numbers. - -* :cpp:var:`Specific - `: Only the - specifically listed versions of the model are available for - inferencing. - -If no version policy is specified, then :cpp:var:`Latest -` (with -num_version = 1) is used as the default, indicating that only the most -recent version of the model is made available by TRTIS. In all cases, -the addition or removal of version subdirectories from the model -repository can change which model version is used on subsequent -inference requests. - -Continuing the above example, the following configuration specifies -that all versions of the model will be available from TRTIS:: - - name: "mymodel" - platform: "tensorrt_plan" - max_batch_size: 8 - input [ - { - name: "input0" - data_type: TYPE_FP32 - dims: [ 16 ] - }, - { - name: "input1" - data_type: TYPE_FP32 - dims: [ 16 ] - } - ] - output [ - { - name: "output0" - data_type: TYPE_FP32 - dims: [ 16 ] - } - ] - version_policy: { all { }} - -.. 
_section-instance-groups: - -Instance Groups ---------------- - -The TensorRT Inference Server can provide multiple :ref:`execution -instances ` of a model so that -multiple simultaneous inference requests for that model can be handled -simultaneously. The model configuration :cpp:var:`ModelInstanceGroup -` is used to specify the -number of execution instances that should be made available and what -compute resource should be used for those instances. - -By default, a single execution instance of the model is created for -each GPU available in the system. The instance-group setting can be -used to place multiple execution instances of a model on every GPU or -on only certain GPUs. For example, the following configuration will -place two execution instances of the model to be available on each -system GPU:: - - instance_group [ - { - count: 2 - kind: KIND_GPU - } - ] - -And the following configuration will place one execution instance on -GPU 0 and two execution instances on GPUs 1 and 2:: - - instance_group [ - { - count: 1 - kind: KIND_GPU - gpus: [ 0 ] - }, - { - count: 2 - kind: KIND_GPU - gpus: [ 1, 2 ] - } - ] - -The instance group setting is also used to enable exection of a model -on the CPU. The following places two execution instances on the CPU:: - - instance_group [ - { - count: 2 - kind: KIND_CPU - } - ] - -.. _section-dynamic-batching: - -Dynamic Batching ----------------- - -The TensorRT Inference Server supports batch inferencing by allowing -individual inference requests to specify a batch of inputs. The -inferencing for a batch of inputs is processed at the same time which -is especially important for GPUs since it can greatly increase -inferencing throughput. In many use-cases the individual inference -requests are not batched, therefore, they do not benefit from the -throughput benefits of batching. - -Dynamic batching is a feature of TRTIS that allows non-batched -inference requests to be combined by TRTIS, so that a batch is created -dynamically, resulting in the same increased throughput seen for -batched inference requests. - -Dynamic batching is enabled and configured independently for each -model using the :cpp:var:`ModelDynamicBatching -` settings in the model -configuration. These settings control the preferred size(s) of the -dynamically created batches as well as a maximum time that requests -can be delayed in the scheduler to allow other requests to join the -dynamic batch. - -The following configuration enables dynamic batching with preferred -batch sizes of 4 and 8, and a maximum delay time of 100 microseconds:: - - dynamic_batching { - preferred_batch_size: [ 4, 8 ] - max_queue_delay_microseconds: 100 - } - -.. _section-optimization-policy: - -Optimization Policy -------------------- - -The model configuration :cpp:var:`ModelOptimizationPolicy -` is used to specify -optimization and prioritization settings for a model. These settings -control if/how a model is optimized by the backend framework and how -it is scheduled and executed by TRTIS. See the protobuf documentation -for the currently available settings. diff --git a/docs/model_repository.rst b/docs/model_repository.rst deleted file mode 100644 index 551dc1d952..0000000000 --- a/docs/model_repository.rst +++ /dev/null @@ -1,297 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
- # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -.. _section-model-repository: - -Model Repository -================ - -The TensorRT Inference Server accesses models from a locally -accessible file path or from Google Cloud Storage. This path is -specified when the server is started using the -\\-model-store option. - -For a locally accessible file-system the absolute path must be -specified, for example, -\\-model-store=/path/to/model/repository. For -a model repository residing in Google Cloud Storage, the path must be -prefixed with gs://, for example, --\\-model-store=gs://bucket/path/to/model/repository. - -:ref:`section-example-model-repository` describes how to create an -example repository with a couple if image classification models. - -An example of a typical model repository layout is shown below:: - - / - model_0/ - config.pbtxt - output0_labels.txt - 1/ - model.plan - 2/ - model.plan - model_1/ - config.pbtxt - output0_labels.txt - output1_labels.txt - 0/ - model.graphdef - 7/ - model.graphdef - -Any number of models may be specified and TRTIS will attempt to load -all models into the CPU and GPU when the server starts. The -:ref:`Status API ` can be used to determine if any -models failed to load successfully. The server's console log will also -show the reason for any failures during startup. - -The name of the model directory (model_0 and model_1 in the above -example) must match the name of the model specified in the -:ref:`model configuration file `, -config.pbtxt. The model name is used in the :ref:`client API -` and :ref:`server API -` to identify the model. Each model -directory must have at least one numeric subdirectory. Each of these -subdirectories holds a version of the model with the version number -corresponding to the directory name. - -For more information about how the model versions are handled by the -server see :ref:`section-model-versions`. Within each version -subdirectory there are one or more model definition files. 
For more -information about the model definition files contained in each version -subdirectory see :ref:`section-model-definition`. - -The \*_labels.txt files are optional and are used to provide labels for -outputs that represent classifications. The label file must be -specified in the :cpp:var:`label_filename -` property of -the output it corresponds to in the :ref:`model configuration -`. - -.. _section-modifying-the-model-repository: - -Modifying the Model Repository ------------------------------- - -By default, changes to the model repository will be detected by a -running TRTIS and the server will attempt to add, remove, and reload -models as necessary based on those changes. Changes to the model -repository may not be detected immediately because TRTIS polls the -repository periodically. You can control the polling interval with the --\\-repository-poll-secs options. The console log or the :ref:`Status -API ` can be used to determine when model -repository changes have taken effect. You can disable the server from -responding to repository changes by using the --\\-allow-poll-model-repository=false option. - -The TensorRT Inference Server responds to the following changes: - -* Versions may be added and removed from models by adding and removing - the corresponding version subdirectory. The inference server will - allow in-flight requests to complete even if they are using a - removed version of the model. New requests for a removed model - version will fail. Depending on the model's :ref:`version policy - `, changes to the available versions may - change which model version is served by default. - -* Existing models can be removed from the repository by removing the - corresponding model directory. TRTIS will allow in-flight requests - to any version of the removed model to complete. New requests for a - removed model will fail. - -* New models can be added to the repository by adding a new model - directory. - -* The :ref:`model configuration ` - (config.pbtxt) can be changed and TRTIS will unload and reload the - model to pick up the new model configuration. - -* Labels files providing labels for outputs that represent - classifications can be added, removed, or modified and TRTIS will - unload and reload the model to pick up the new labels. If a label - file is added or removed the corresponding edit to the - :cpp:var:`label_filename - ` property of - the output it corresponds to in the :ref:`model configuration - ` must be performed at the same time. - -.. _section-model-versions: - -Model Versions --------------- - -Each model can have one or more versions available in the model -repository. Each version is stored in its own, numerically named, -subdirectory where the name of the subdirectory corresponds to the -version number of the model. Each model specifies a :ref:`version -policy ` that controls which of the versions -in the model repository are made available by TRTIS at any given time. - -.. _section-model-definition: - -Model Definition ----------------- - -Each model version subdirectory must contain at least one model -definition. By default, the name of this file or directory must be: - -* **model.plan** for TensorRT models -* **model.graphdef** for TensorFlow GraphDef models -* **model.savedmodel** for TensorFlow SavedModel models -* **model.netdef** and **init_model.netdef** for Caffe2 Netdef models - -This default name can be overridden using the *default_model_filename* -property in the :ref:`model configuration -`. 
- -Optionally, a model can provide multiple model definition files, each -targeted at a GPU with a different `Compute Capability -`_. Most commonly, this -feature is needed for TensorRT and TensorFlow/TensorRT integrated -models where the model definition is valid for only a single compute -capability. See the *cc_model_filenames* property in the :ref:`model -configuration ` for description of how to -specify different model definitions for different compute -capabilities. - -.. _section-tensorrt-models: - -TensorRT Models -^^^^^^^^^^^^^^^ - -A TensorRT model definition is called a *Plan*. A TensorRT Plan is a -single file that by default must be named model.plan. A TensorRT Plan -is specific to CUDA Compute Capability and so it is typically -necessary to use the :ref:`model configuration's -` *cc_model_filenames* property as -described above. - -A minimal model repository for a single TensorRT model would look -like:: - - models/ - / - config.pbtxt - 1/ - model.plan - -As described in :ref:`section-generated-model-configuration` the -config.pbtxt is optional for some models. In cases where it is not -required the minimal model repository would look like:: - - models/ - / - 1/ - model.plan - -.. _section-tensorflow-models: - -TensorFlow Models -^^^^^^^^^^^^^^^^^ - -TensorFlow saves trained models in one of two ways: *GraphDef* or -*SavedModel*. The inference server supports both formats. Once you -have a trained model in TensorFlow, you can save it as a GraphDef -directly or convert it to a GraphDef by using a script like -`freeze_graph.py -`_, -or save it as a SavedModel using a `SavedModelBuilder -`_ or -`tf.saved_model.simple_save -`_. - -A TensorFlow GraphDef is a single file that by default must be named -model.graphdef. A minimal model repository for a single TensorFlow -GraphDef model would look like:: - - models/ - / - config.pbtxt - 1/ - model.graphdef - -A TensorFlow SavedModel is a directory containing multiple files. By -default the directory must be named model.savedmodel. A minimal model -repository for a single TensorFlow SavedModel model would look like:: - - models/ - / - config.pbtxt - 1/ - model.savedmodel/ - - -As described in :ref:`section-generated-model-configuration` the -config.pbtxt is optional for some models. In cases where it is not -required the minimal model repository would look like:: - - models/ - / - 1/ - model.savedmodel/ - - -Caffe2 Models -^^^^^^^^^^^^^ - -A Caffe2 model definition is called a *NetDef*. A Caffe2 NetDef is a -single file that by default must be named model.netdef. A minimal -model repository for a single NetDef model would look like:: - - models/ - / - config.pbtxt - 1/ - model.netdef - -TensorRT/TensorFlow Models -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -TensorFlow 1.7 and later integrates TensorRT to enable TensorFlow -models to benefit from the inference optimizations provided by -TensorRT. TRTIS supports models that have been optimized with TensorRT -and can serve those models just like any other TensorFlow model. The -inference server’s TensorRT version (available in the Release Notes) -must match the TensorRT version that was used when the model was -created. - -A TensorRT/TensorFlow integrated model is specific to CUDA Compute -Capability and so it is typically necessary to use the :ref:`model -configuration's ` *cc_model_filenames* -property as described above. - -ONNX Models -^^^^^^^^^^^ - -The TensorRT Inference Server cannot directly perform inferencing -using `ONNX `_ models. 
An ONNX model must be -converted to either a TensorRT Plan or a Caffe2 NetDef. To convert -your ONNX model to a TensorRT Plan use either the `ONNX Parser -`_ -included in TensorRT or the `open-source TensorRT backend for ONNX -`_. Another option is to -convert your ONNX model to Caffe2 NetDef `as described here -`_. diff --git a/docs/protobuf_api/gen_proto_doc.sh b/docs/protobuf_api/gen_proto_doc.sh deleted file mode 100755 index 5c86fdb881..0000000000 --- a/docs/protobuf_api/gen_proto_doc.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -for pf in "$@"; do - bname=$(basename $pf) - echo -e "$bname" > $bname.rst - echo -e "====================\n" >> $bname.rst - sed -n -e 's/.*\/\/@@\(.*\)/\1/p' $pf >> $bname.rst -done diff --git a/docs/protobuf_api/protobuf_api_root.rst b/docs/protobuf_api/protobuf_api_root.rst deleted file mode 100644 index 696c46a565..0000000000 --- a/docs/protobuf_api/protobuf_api_root.rst +++ /dev/null @@ -1,43 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Protobuf API -============ - -HTTP/GRPC API -------------- -* :doc:`src/core/api.proto ` -* :doc:`src/core/grpc_service.proto ` -* :doc:`src/core/request_status.proto ` - -Model Configuration -------------------- -* :doc:`src/core/model_config.proto ` - -Status ------- -* :doc:`src/core/server_status.proto ` diff --git a/docs/protocol/README.md b/docs/protocol/README.md new file mode 100644 index 0000000000..3ad2517e32 --- /dev/null +++ b/docs/protocol/README.md @@ -0,0 +1,118 @@ + + +# HTTP/REST and GRPC Protocol + +This directory contains documents related to the HTTP/REST and GRPC +protocols used by Triton. Triton uses the [KServe community standard +inference +protocols](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2) +plus several extensions that are defined in the following documents: + +- [Binary tensor data extension](./extension_binary_data.md) +- [Classification extension](./extension_classification.md) +- [Schedule policy extension](./extension_schedule_policy.md) +- [Sequence extension](./extension_sequence.md) +- [Shared-memory extension](./extension_shared_memory.md) +- [Model configuration extension](./extension_model_configuration.md) +- [Model repository extension](./extension_model_repository.md) +- [Statistics extension](./extension_statistics.md) +- [Trace extension](./extension_trace.md) +- [Logging extension](./extension_logging.md) +- [Parameters extension](./extension_parameters.md) + +Note that some extensions introduce new fields onto the inference protocols, +and the other extensions define new protocols that Triton follows, please refer +to the extension documents for detail. + +For the GRPC protocol, the [protobuf +specification](https://github.com/triton-inference-server/common/blob/main/protobuf/grpc_service.proto) +is also available. In addition, you can find the GRPC health checking protocol protobuf +specification [here](https://github.com/triton-inference-server/common/blob/main/protobuf/health.proto). + +## Restricted Protocols + +You can configure the Triton endpoints, which implement the protocols, to +restrict access to some protocols and to control network settings, please refer +to [protocol customization guide](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols) for detail. + +## IPv6 + +Assuming your host or [docker config](https://docs.docker.com/config/daemon/ipv6/) +supports IPv6 connections, `tritonserver` can be configured to use IPv6 +HTTP endpoints as follows: +``` +$ tritonserver ... --http-address ipv6:[::1]& +... 
+I0215 21:04:11.572305 571 grpc_server.cc:4868] Started GRPCInferenceService at 0.0.0.0:8001 +I0215 21:04:11.572528 571 http_server.cc:3477] Started HTTPService at ipv6:[::1]:8000 +I0215 21:04:11.614167 571 http_server.cc:184] Started Metrics Service at ipv6:[::1]:8002 +``` + +This can be confirmed via `netstat`, for example: +``` +$ netstat -tulpn | grep tritonserver +tcp6 0 0 :::8000 :::* LISTEN 571/tritonserver +tcp6 0 0 :::8001 :::* LISTEN 571/tritonserver +tcp6 0 0 :::8002 :::* LISTEN 571/tritonserver +``` + +And can be tested via `curl`, for example: +``` +$ curl -6 --verbose "http://[::1]:8000/v2/health/ready" +* Trying ::1:8000... +* TCP_NODELAY set +* Connected to ::1 (::1) port 8000 (#0) +> GET /v2/health/ready HTTP/1.1 +> Host: [::1]:8000 +> User-Agent: curl/7.68.0 +> Accept: */* +> +* Mark bundle as not supporting multiuse +< HTTP/1.1 200 OK +< Content-Length: 0 +< Content-Type: text/plain +< +* Connection #0 to host ::1 left intact +``` + + +## Mapping Triton Server Error Codes to HTTP Status Codes + +This table maps various Triton Server error codes to their corresponding HTTP status +codes. It can be used as a reference guide for understanding how Triton Server errors +are handled in HTTP responses. + + +| Triton Server Error Code | HTTP Status Code | Description | +| ----------------------------------------------| -------------------| ---------------------| +| `TRITONSERVER_ERROR_INTERNAL` | 500 | Internal Server Error| +| `TRITONSERVER_ERROR_NOT_FOUND` | 404 | Not Found | +| `TRITONSERVER_ERROR_UNAVAILABLE` | 503 | Service Unavailable | +| `TRITONSERVER_ERROR_UNSUPPORTED` | 501 | Not Implemented | +| `TRITONSERVER_ERROR_UNKNOWN`,
`TRITONSERVER_ERROR_INVALID_ARG`,
`TRITONSERVER_ERROR_ALREADY_EXISTS`,
`TRITONSERVER_ERROR_CANCELLED` | `400` | Bad Request (default for other errors) | diff --git a/docs/protocol/extension_binary_data.md b/docs/protocol/extension_binary_data.md new file mode 100644 index 0000000000..d04edda28b --- /dev/null +++ b/docs/protocol/extension_binary_data.md @@ -0,0 +1,216 @@ + + +# Binary Tensor Data Extension + +This document describes Triton's binary tensor data extension. The +binary tensor data extension allows Triton to support tensor data +represented in a binary format in the body of an HTTP/REST +request. Because this extension is supported, Triton reports +“binary_tensor_data†in the extensions field of its Server Metadata. + +## Binary Tensor Request + +Tensor data represented as binary data is organized in little-endian +byte order, row major, without stride or padding between elements. All +tensor data types are representable as binary data in the native size +of the data type. For BOOL type element true is a single byte with +value 1 and false is a single byte with value 0. For BYTES type an +element is represented by a 4-byte unsigned integer giving the length +followed by the actual bytes. The binary data for a tensor is +delivered in the HTTP body after the JSON object (see Examples). + +The binary tensor data extension uses parameters to indicate that an +input or output tensor is communicated as binary data. The first +parameter is used in `$request_input` and `$response_output` to indicate +that the input or output tensor is communicated as binary data: + +- "binary_data_size" : int64 parameter indicating the size of the + tensor binary data, in bytes. + +The second parameter is used in `$request_output` to indicate that the +output should be returned from Triton as binary data. + +- "binary_data" : bool parameter that is true if the output should be + returned as binary data and false (or not given) if the tensor + should be returned as JSON. + +The third parameter is used in $inference_request to indicate that all +outputs should be returned from Triton as binary data, unless +overridden by "binary_data" on a specific output. + +- "binary_data_output" : bool parameter that is true if all outputs + should be returned as binary data and false (or not given) if the + outputs should be returned as JSON. If "binary_data" is specified on + an output it overrides this setting. + +When one or more tensors are communicated as binary data, the HTTP +body of the request or response will contain the JSON inference +request or response object followed by the binary tensor data in the +same order as the order of the input or output tensors are specified +in the JSON. If any binary data is present in the request or response +the Inference-Header-Content-Length header must be provided to give +the length of the JSON object, and Content-Length continues to give +the full body length (as HTTP requires). + +### Examples + +For the following request the input tensors are sent as binary data +and the output tensor must be returned as binary data as that is what +is requested. Also note that the total size of the binary data is 19 +bytes and that size must be reflected in the content length headers. 
+ +``` +POST /v2/models/mymodel/infer HTTP/1.1 +Host: localhost:8000 +Content-Type: application/octet-stream +Inference-Header-Content-Length: +Content-Length: +{ + "model_name" : "mymodel", + "inputs" : [ + { + "name" : "input0", + "shape" : [ 2, 2 ], + "datatype" : "UINT32", + "parameters" : { + "binary_data_size" : 16 + } + }, + { + "name" : "input1", + "shape" : [ 3 ], + "datatype" : "BOOL", + "parameters" : { + "binary_data_size" : 3 + } + } + ], + "outputs" : [ + { + "name" : "output0", + "parameters" : { + "binary_data" : true + } + } + ] +} +<16 bytes of data for input0 tensor> +<3 bytes of data for input1 tensor> +``` + +Assuming the model returns a [ 3, 2 ] tensor of data type FP32 the +following response would be returned. + +``` +HTTP/1.1 200 OK +Content-Type: application/octet-stream +Inference-Header-Content-Length: +Content-Length: +{ + "outputs" : [ + { + "name" : "output0", + "shape" : [ 3, 2 ], + "datatype" : "FP32", + "parameters" : { + "binary_data_size" : 24 + } + } + ] +} +<24 bytes of data for output0 tensor> +``` + +## Raw Binary Request + +For models whose tensor metadata can be deduced from the byte size of the binary +data. User may send the binary tensor request without specifying inference +header. In other words, the request body only contains the binary data of the +tensor. Below is the constraints for the qualified models: + +1. Only has 1 input +2. If the input data type is non-BYTE, the number of variable size dimensions is +at most 1. If the data type is BYTE, the shape must be [1]. The supported data +types can be found [here](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#tensor-data-types) + +To send a raw binary request, the Inference-Header-Content-Length header must be +provided with value 0 to indicate that the request body doesn't include the +inference header. + +Note: if the model supports batching, the request will be treated as batch-1 +request because the inference header is omitted. Additionally, all the model +output will be requested to be returned in binary tensor form as described in +the previous section. + +### Examples + +The following is the example of sending raw binary request. Note that the total +size of the binary data is 16 bytes and that size must be reflected in +the content length headers. + +``` +POST /v2/models/mymodel/infer HTTP/1.1 +Host: localhost:8000 +Content-Type: application/octet-stream +Inference-Header-Content-Length: 0 +Content-Length: 16 +<16 bytes of data for input tensor> +``` + +Assuming the model returns two outputs which both has shape [ 3, 1 ] and data +type FP32, then the following response would be returned. + +``` +HTTP/1.1 200 OK +Content-Type: application/octet-stream +Inference-Header-Content-Length: +Content-Length: +{ + "outputs" : [ + { + "name" : "output0", + "shape" : [ 3, 1 ], + "datatype" : "FP32", + "parameters" : { + "binary_data_size" : 12 + } + }, + { + "name" : "output1", + "shape" : [ 3, 1 ], + "datatype" : "FP32", + "parameters" : { + "binary_data_size" : 12 + } + } + ] +} +<12 bytes of data for output0 tensor> +<12 bytes of data for output1 tensor> +``` \ No newline at end of file diff --git a/docs/protocol/extension_classification.md b/docs/protocol/extension_classification.md new file mode 100644 index 0000000000..5c481e16a7 --- /dev/null +++ b/docs/protocol/extension_classification.md @@ -0,0 +1,200 @@ + + +# Classification Extension + +This document describes Triton's classification extension. 
The +classification extension allows Triton to return an output as a +classification index and (optional) label instead of returning the +output as raw tensor data. Because this extension is supported, +Triton reports “classification†in the extensions field of its Server +Metadata. + +An inference request can use the “classification†parameter to request +that one or more classifications be returned for an output. For such +an output the returned tensor will not be the shape and type produced +by the model, but will instead be type BYTES with shape [ batch-size, +\ ] where each element returns the classification index and +label as a single string. The \ dimension of the returned tensor +will equal the “count†value specified in the classification +parameter. + +When the classification parameter is used, Triton will determine the +top-n classifications as the n highest-valued elements in the output +tensor compared using the output tensor’s data type. For example, if +an output tensor is [ 1, 5, 10, 4 ], the highest-valued element is 10 +(index 2), followed by 5 (index 1), followed by 4 (index 3), followed +by 1 (index 0). So, for example, the top-2 classifications by index +are [ 2, 1 ]. + +The format of the returned string will be “\:\[:\]â€, +where \ is the index of the class in the model output tensor, +\ is the value associated with that index in the model output, +and the \ associated with that index is optional. For example, +continuing the example from above, the returned tensor will be [ +“10:2â€, “5:1†]. If the model has labels associated with those +indices, the returned tensor will be [ “10:2:appleâ€, “5:1:pickle†]. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. #optional +indicates an optional JSON field. + +The classification extension requires that the “classification†+parameter, when applied to a requested inference output, be recognized +by Triton as follows: + +- “classification†: `$number` indicating the number of classes that + should be returned for the output. + +The following example shows how the classification parameter is used +in an inference request. + +``` +POST /v2/models/mymodel/infer HTTP/1.1 +Host: localhost:8000 +Content-Type: application/json +Content-Length: +{ + "id" : "42", + "inputs" : [ + { + "name" : "input0", + "shape" : [ 2, 2 ], + "datatype" : "UINT32", + "data" : [ 1, 2, 3, 4 ] + } + ], + "outputs" : [ + { + "name" : "output0", + "parameters" : { "classification" : 2 } + } + ] +} +``` + +For the above request Triton will return the “output0†output tensor +as a STRING tensor with shape [ 2 ]. Assuming the model produces +output0 tensor [ 1.1, 3.3, 0.5, 2.4 ] from the above inputs, the +response will be the following. + +``` +HTTP/1.1 200 OK +Content-Type: application/json +Content-Length: +{ + "id" : "42" + "outputs" : [ + { + "name" : "output0", + "shape" : [ 2 ], + "datatype" : "STRING", + "data" : [ "3.3:1", "2.4:3" ] + } + ] +} +``` + +If the model has labels associated with each classification index +Triton will return those as well, as shown below. 
+ +``` +HTTP/1.1 200 OK +Content-Type: application/json +Content-Length: +{ + "id" : "42" + "outputs" : [ + { + "name" : "output0", + "shape" : [ 2 ], + "datatype" : "STRING", + "data" : [ "3.3:1:index_1_label", "2.4:3:index_3_label" ] + } + ] +} +``` + +## GRPC + +The classification extension requires that the “classification†+parameter, when applied to a requested inference output, be recognized +by Triton as follows: + +- “classification†: int64_param indicating the number of classes that + should be returned for the output. + +The following example shows how the classification parameter is used +in an inference request. + +``` +ModelInferRequest { + model_name : "mymodel" + model_version : -1 + inputs [ + { + name : "input0" + shape : [ 2, 2 ] + datatype : "UINT32" + contents { int_contents : [ 1, 2, 3, 4 ] } + } + ] + outputs [ + { + name : "output0" + parameters [ + { + key : "classification" + value : { int64_param : 2 } + } + ] + } + ] +} +``` + +For the above request Triton will return the “output0†output tensor +as a STRING tensor with shape [ 2 ]. Assuming the model produces +output0 tensor [ 1.1, 3.3, 0.5, 2.4 ] from the above inputs, the +response will be the following. + +``` +ModelInferResponse { + model_name : "mymodel" + outputs [ + { + name : "output0" + shape : [ 2 ] + datatype : "STRING" + contents { bytes_contents : [ "3.3:1", "2.4:3" ] } + } + ] +} +``` diff --git a/docs/protocol/extension_generate.md b/docs/protocol/extension_generate.md new file mode 100644 index 0000000000..043339eb4a --- /dev/null +++ b/docs/protocol/extension_generate.md @@ -0,0 +1,194 @@ + + +# Generate Extension + +> [!NOTE] +> The Generate Extension is *provisional* and likely to change in future versions. + +This document describes Triton's generate extension. The generate +extension provides a simple text-oriented endpoint schema for interacting with +large language models (LLMs). The generate endpoint is specific to HTTP/REST +frontend. + +## HTTP/REST + +In all JSON schemas shown in this document, `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. #optional +indicates an optional JSON field. + +Triton exposes the generate endpoint at the following URLs. The client may use +HTTP POST request to different URLs for different response behavior, the +endpoint will return the generate results on success or an error in the case of +failure. + +``` +POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate + +POST v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream +``` + +### generate vs. generate_stream + +Both URLs expect the same request JSON object, and generate the same JSON +response object. However, there are some differences in the format used to +return each: +* `/generate` returns exactly 1 response JSON object with a +`Content-Type` of `application/json` +* `/generate_stream` may return multiple responses based on the inference +results, with a `Content-Type` of `text/event-stream; charset=utf-8`. +These responses will be sent as +[Server-Sent Events](https://html.spec.whatwg.org/multipage/server-sent-events.html#server-sent-events) +(SSE), where each response will be a "data" chunk in the HTTP +response body. In the case of inference errors, responses will have +an [error JSON object](#generate-response-json-error-object). 
+ * Note that the HTTP response code is set in the first response of the SSE, + so if the first response succeeds but an error occurs in a subsequent + response for the request, it can result in receiving an error object + while the status code shows success (200). Therefore, the user must + always check whether an error object is received when generating + responses through `/generate_stream`. + * If the request fails before inference begins, then a JSON error will + be returned with `Content-Type` of `application/json`, similar to errors + from other endpoints with the status code set to an error. + +### Generate Request JSON Object + +The generate request object, identified as *$generate_request*, is +required in the HTTP body of the POST request. The model name and +(optionally) version must be available in the URL. If a version is not +provided, the server may choose a version based on its own policies or +return an error. + + $generate_request = + { + "id" : $string, #optional + "text_input" : $string, + "parameters" : $parameters #optional + } + +* "id": An identifier for this request. Optional, but if specified this identifier must be returned in the response. +* "text_input" : The text input that the model should generate output from. +* "parameters" : An optional object containing zero or more parameters for this + generate request expressed as key/value pairs. See + [Parameters](#parameters) for more information. + +> [!NOTE] +> Any additional properties in the request object are passed either as +> parameters or tensors based on model specification. + +#### Parameters + +The `$parameters` JSON describes zero or more “nameâ€/â€value†pairs, +where the “name†is the name of the parameter and the “value†is a +`$string`, `$number`, or `$boolean`. + + $parameters = + { + $parameter, ... + } + + $parameter = $string : $string | $number | $boolean + +Parameters are model-specific. The user should check with the model +specification to set the parameters. + +#### Example Request + +Below is an example to send generate request with additional model parameters `stream` and `temperature`. + +``` +$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"id": "42", "text_input": "client input", "parameters": {"stream": false, "temperature": 0}}' + +POST /v2/models/mymodel/generate HTTP/1.1 +Host: localhost:8000 +Content-Type: application/json +Content-Length: +{ + "id" : "42", + "text_input" : "client input", + "parameters" : + { + "stream": false, + "temperature": 0 + } +} +``` + +### Generate Response JSON Object + +A successful generate request is indicated by a 200 HTTP status code. +The generate response object, identified as `$generate_response`, is returned in +the HTTP body. + + $generate_response = + { + "id" : $string + "model_name" : $string, + "model_version" : $string, + "text_output" : $string + } + +* "id" : The "id" identifier given in the request, if any. +* "model_name" : The name of the model used for inference. +* "model_version" : The specific model version used for inference. +* "text_output" : The output of the inference. + +#### Example Response + +``` +200 +{ + "id" : "42" + "model_name" : "mymodel", + "model_version" : "1", + "text_output" : "model output" +} +``` + +### Generate Response JSON Error Object + +A failed generate request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$generate_error_response` object. + + $generate_error_response = + { + "error": + } + +* “error†: The descriptive message for the error. 
+ +#### Example Error + +``` +400 +{ + "error" : "error message" +} +``` diff --git a/docs/protocol/extension_logging.md b/docs/protocol/extension_logging.md new file mode 100644 index 0000000000..87fd5e2c25 --- /dev/null +++ b/docs/protocol/extension_logging.md @@ -0,0 +1,275 @@ + + +# Logging Extension + +This document describes Triton's logging extension. The logging extension enables +the client to configure log settings during a Triton run. Triton reports "logging" +in the extensions field of its Server Metadata. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. #optional +indicates an optional JSON field. + +Triton exposes the logging endpoint at the following URL. The client may use +HTTP GET request to retrieve the current log settings. A HTTP POST request +will modify the log settings, and the endpoint will return the updated log +settings on success or an error in the case of failure. + +``` +GET v2/logging + +POST v2/logging +``` + +### Log Setting Response JSON Object + +A successful log setting request is indicated by a 200 HTTP status +code. The response object, identified as `$log_setting_response`, is +returned in the HTTP body for every successful log setting request. + +``` +$log_setting_response = +{ + $log_setting, ... +} + +$log_setting = $string : $string | $boolean | $number +``` + +Each `$log_setting` JSON describes a “nameâ€/â€value†pair, where the “name†is +the `$string` representation of the log setting and the “value†is a `$string`, +`$bool`, or `$number` representation of the setting value. Currently, the +following log settings are defined: + +- "log_file" : a `$string` log file location where the log outputs will be saved. If empty, log outputs are streamed to the console. + +- "log_info" : a `$boolean` parameter that controls whether the Triton server logs INFO level messages. + +- "log_warning" : a `$boolean` parameter that controls whether the Triton server logs WARNING level messages. + +- "log_error" : a `$boolean` parameter that controls whether the Triton server logs ERROR level messages. + +- "log_verbose_level" : a `$number` parameter that controls whether the Triton server outputs verbose messages +of varying degrees. This value can be any integer >= 0. If "log_verbose_level" is 0, verbose logging will be disabled, and +no verbose messages will be output by the Triton server. If "log_verbose_level" is 1, level 1 verbose messages will be output +by the Triton server. If "log_verbose_level" is 2, the Triton server will output all verbose messages of +level <= 2, etc. Attempting to set "log_verbose_level" to a number < 0 will result in an error. + +- "log_format" : a `$string` parameter that controls the format of Triton server log messages. There are currently +2 formats: "default" and "ISO8601". + + +### Log Setting Response JSON Error Object + +A failed log setting request will be indicated by an HTTP error status +(typically 400). The HTTP body will contain a `$log_setting_error_response` object. + +``` +$log_setting_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +### Log Setting Request JSON Object + +A log setting request is made with a HTTP POST to +the logging endpoint. In the corresponding response, the HTTP body contains the +response JSON. A successful request is indicated by a 200 HTTP status code. 
+ +The request object, identified as `$log_setting_request` must be provided in the HTTP +body. + +``` +$log_setting_request = +{ + $log_setting, ... +} +``` + +When a `$log_setting` JSON is received (defined above), only the +specified settings will be updated. Currently, the following log +settings (described above) can be updated: +- "log_info" +- "log_warning" +- "log_error" +- "log_verbose_level" +- "log_format" + +### Example Usage +The logging protocol extension can be invoked using the curl library in the following manner (assuming +a Triton server is running at `localhost:8000`): +``` +curl -s -w '\n%{http_code}\n' -d '{"log_verbose_level":1}' -X POST localhost:8000/v2/logging +``` +This command should return a `$log_setting_response` JSON object with the following format: +``` +{"log_file":"","log_info":true,"log_warnings":true,"log_errors":true,"log_verbose_level":1,"log_format":"default"} +200 +``` +Note that the current values for all parameter fields are returned even though `log_verbose_level` +was the only parameter that was modified. + +## GRPC + +For the logging extension, Triton implements the following API: + +``` +service GRPCInferenceService +{ + … + + // Update and get the log setting of the Triton server. + rpc LogSettings(LogSettingsRequest) + returns (LogSettingsResponse) {} +} +``` + +The Log Setting API returns the latest log settings. Errors are indicated +by the `google.rpc.Status` returned for the request. The OK code +indicates success and other codes indicate failure. The request and +response messages for Log Settings are: + +``` +message LogSettingsRequest +{ + message SettingValue + { + oneof parameter_choice + { + // bool param option + bool bool_param = 1; + + // uint32 param option + uint32 uint32_param = 2; + + // string param option + string string_param = 3; + } + } + // The new setting values to be updated. + // Unspecified settings will remain unchanged. + map settings = 1; +} + +message LogSettingsResponse +{ + message SettingValue + { + oneof parameter_choice + { + // bool param option + bool bool_param = 1; + + // uint32 param option + uint32 uint32_param = 2; + + // string param option + string string_param = 3; + } + } + // The latest log settings values. + map settings = 1; +} +``` + +## Logging Formats + +The logging extension offers two logging formats. The formats have a +common set of fields but differ in how the timestamp for a log entry +is represented. Messages are serialized according to JSON encoding +rules by default. This behavior can be disabled by setting the +environment variable TRITON_SERVER_ESCAPE_LOG_MESSAGES to "0" when +launching the server but can not be changed through the logging +extension. + +Log entries can be single-line or multi-line. Multi-line entries have +a single optional heading followed by the structured representation of +an object such as a table or protobuf message. Multi-line entries end +when the next log entry begins. + +1. TRITONSERVER_LOG_DEFAULT + +### Single-line Entry +``` +::. :] +``` +Example: +``` +I0520 20:03:25.829575 3355 model_lifecycle.cc:441] "AsyncLoad() 'simple'" +``` +### Multi-line Entry +``` +::. :] + +``` +Example: + +``` +I0520 20:03:25.912303 3355 server.cc:676] ++--------+---------+--------+ +| Model | Version | Status | ++--------+---------+--------+ +| simple | 1 | READY | ++--------+---------+--------+ +``` + + +2. 
TRITONSERVER_LOG_ISO8601 + +### Single-line Entry +``` +--T::Z :] +``` + +Example: +``` +2024-05-20T20:03:26Z I 3415 model_lifecycle.cc:441] "AsyncLoad() 'simple'" +``` + +### Multi-line Entry +``` +--T::Z :] + +``` + +Example: + +``` +2024-05-20T20:03:26Z I 3415 server.cc:676] ++--------+---------+--------+ +| Model | Version | Status | ++--------+---------+--------+ +| simple | 1 | READY | ++--------+---------+--------+ +``` diff --git a/docs/protocol/extension_model_configuration.md b/docs/protocol/extension_model_configuration.md new file mode 100644 index 0000000000..04a2d28fac --- /dev/null +++ b/docs/protocol/extension_model_configuration.md @@ -0,0 +1,118 @@ + + +# Model Configuration Extension + +This document describes Triton's model configuration extension. The +model configuration extension allows Triton to return server-specific +information. Because this extension is supported, Triton reports +“model_configuration†in the extensions field of its Server Metadata. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. #optional +indicates an optional JSON field. + +Triton exposes the model configuration endpoint at the following +URL. The versions portion of the URL is optional; if not provided +Triton will return model configuration for the highest-numbered +version of the model. + +``` +GET v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/config +``` + +A model configuration request is made with an HTTP GET to the model +configuration endpoint.A successful model configuration request is +indicated by a 200 HTTP status code. The model configuration response +object, identified as `$model_configuration_response`, is returned in +the HTTP body for every successful request. + +``` +$model_configuration_response = +{ + # configuration JSON +} +``` + +The contents of the response will be the JSON representation of the +model's configuration described by the [ModelConfig message from +model_config.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). + +A failed model configuration request must be indicated by an HTTP +error status (typically 400). The HTTP body must contain the +`$model_configuration_error_response` object. + +``` +$model_configuration_error_response = +{ + "error": +} +``` + +- “error†: The descriptive message for the error. + +## GRPC + +The GRPC definition of the service is: + +``` +service GRPCInferenceService +{ + … + + // Get model configuration. + rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {} +} +``` + +Errors are indicated by the google.rpc.Status returned for the +request. The OK code indicates success and other codes indicate +failure. The request and response messages for ModelConfig are: + +``` +message ModelConfigRequest +{ + // The name of the model. + string name = 1; + + // The version of the model. If not given the version of the model + // is selected automatically based on the version policy. + string version = 2; +} + +message ModelConfigResponse +{ + // The model configuration. + ModelConfig config = 1; +} +``` + +Where the ModelConfig message is defined in +[model_config.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). 
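To make the HTTP endpoint above concrete, the following is a minimal sketch of querying a model's configuration with `curl`; the model name `mymodel`, the server address `localhost:8000`, and the abbreviated response shown are assumptions for illustration only, since the fields returned depend entirely on the model's configuration:

```
$ curl -s localhost:8000/v2/models/mymodel/config
{"name":"mymodel","backend":"onnxruntime","max_batch_size":8,"input":[...],"output":[...]}
```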
diff --git a/docs/protocol/extension_model_repository.md b/docs/protocol/extension_model_repository.md new file mode 100644 index 0000000000..b0009043f5 --- /dev/null +++ b/docs/protocol/extension_model_repository.md @@ -0,0 +1,400 @@ + + +# Model Repository Extension + +This document describes Triton's model repository extension. The +model-repository extension allows a client to query and control the +one or more model repositories being served by Triton. Because this +extension is supported, Triton reports “model_repository†in the +extensions field of the Server Metadata. This extension has an +optional component, described below, that allows the unload API to +specify the "unload_dependents" parameter. Versions of Triton that +support this optional component will also report +"model_repository(unload_dependents)" in the extensions field of the +Server Metadata. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. `#optional` +indicates an optional JSON field. + +The model-repository extension requires Index, Load and Unload +APIs. Triton exposes the endpoints at the following URLs. + +``` +POST v2/repository/index + +POST v2/repository/models/${MODEL_NAME}/load + +POST v2/repository/models/${MODEL_NAME}/unload +``` + +### Index + +The index API returns information about every model available in a +model repository, even if it is not currently loaded into Triton. The +index API provides a way to determine which models can potentially be +loaded by the Load API. A model-repository index request is made with +an HTTP POST to the index endpoint. In the corresponding response the +HTTP body contains the JSON response. + +The index request object, identified as `$repository_index_request`, is +required in the HTTP body of the POST request. + +``` +$repository_index_request = +{ + "ready" : $boolean #optional, +} +``` + +- "ready" : Optional, default is false. If true return only models ready for inferencing. + +A successful index request is indicated by a 200 HTTP status code. The +response object, identified as `$repository_index_response`, is returned +in the HTTP body for every successful request. + +``` +$repository_index_response = +[ + { + "name" : $string, + "version" : $string #optional, + "state" : $string, + "reason" : $string + }, + … +] +``` + +- “name†: The name of the model. +- “version†: The version of the model. +- “state†: The state of the model. +- “reason†: The reason, if any, that the model is in the current state. + +A failed index request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$repository_index_error_response` object. + +``` +$repository_index_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +### Load + +The load API requests that a model be loaded into Triton, or reloaded +if the model is already loaded. A load request is made with an HTTP +POST to a load endpoint. The HTTP body may be empty or may contain +the load request object, identified as `$repository_load_request`. +A successful load request is indicated by a 200 HTTP status. + + +``` +$repository_load_request = +{ + "parameters" : $parameters #optional +} +``` + +- "parameters" : An object containing zero or more parameters for this + request expressed as key/value pairs. 
See + [Parameters](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#parameters) + for more information. + +The load API accepts the following parameters: + +- "config" : string parameter that contains a JSON representation of the model +configuration, which must be able to be parsed into [ModelConfig message from +model_config.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). +This config will be used for loading the model instead of the one in +the model directory. If config is provided, the (re-)load will be triggered as +the model metadata has been updated, and the same (re-)load behavior will be +applied. + +- "file:\/\" : The serialized model file, base64 encoded. +This convention will be used to specify the override model directory to load +the model from. For instance, if the user wants to specify a model directory +that contains an ONNX model as version 2, then the user will specify the +parameter to "file:2/model.onnx" : "\". Note that +"config" parameter must be provided to serve as the model configuration of the +override model directory. + +A failed load request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$repository_load_error_response` object. + +``` +$repository_load_error_response = +{ + "error": $string +} +``` +- “error†: The descriptive message for the error. + +#### Examples + +For the following request, Triton will load the model "mymodel" with provided +model configuration and model file. + +``` +POST /v2/repository/models/mymodel/load HTTP/1.1 +Host: localhost:8000 +{ + "parameters": { + "config": "{ + "name": "mymodel", + "backend": "onnxruntime", + "inputs": [{ + "name": "INPUT0", + "datatype": "FP32", + "shape": [ 1 ] + } + ], + "outputs": [{ + "name": "OUTPUT0", + "datatype": "FP32", + "shape": [ 1 ] + } + ] + }", + + "file:1/model.onnx" : "" + } +} +``` + +### Unload + +The unload API requests that a model be unloaded from Triton. An +unload request is made with an HTTP POST to an unload endpoint. The +HTTP body may be empty or may contain the unload request object, +identified as `$repository_unload_request`. A successful unload request +is indicated by a 200 HTTP status. + +``` +$repository_unload_request = +{ + "parameters" : $parameters #optional +} +``` + +- "parameters" : An object containing zero or more parameters for this + request expressed as key/value pairs. See + [Parameters](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#parameters) + for more information. + +The unload API accepts the following parameters: + +- "unload_dependents" : boolean parameter indicating that in addition + to unloading the requested model, also unload any dependent model + that was loaded along with the requested model. For example, request to + unload the models composing an ensemble will unload the ensemble as well. + +A failed unload request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$repository_unload_error_response` object. + +``` +$repository_unload_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +## GRPC + +The model-repository extension requires the following API: + +``` +service GRPCInferenceService +{ + … + + // Get the index of model repository contents. + rpc RepositoryIndex(RepositoryIndexRequest) + returns (RepositoryIndexResponse) {} + + // Load or reload a model from a repository. 
+ rpc RepositoryModelLoad(RepositoryModeLoadRequest) + returns (RepositoryModelLoadResponse) {} + + // Unload a model. + rpc RepositoryModelUnload(RepositoryModelUnloadRequest) + returns (RepositoryModelUnloadResponse) {} +} + +message ModelRepositoryParameter +{ + // The parameter value can be a string, an int64, a boolean + // or a message specific to a predefined parameter. + oneof parameter_choice + { + // A boolean parameter value. + bool bool_param = 1; + + // An int64 parameter value. + int64 int64_param = 2; + + // A string parameter value. + string string_param = 3; + + // A bytes parameter value. + bytes bytes_param = 4; + } +} +``` + +### Index + +The RepositoryIndex API returns information about every model +available in a model repository, even if it is not currently loaded +into Triton. Errors are indicated by the google.rpc.Status returned +for the request. The OK code indicates success and other codes +indicate failure. The request and response messages for +RepositoryIndex are: + +``` +message RepositoryIndexRequest +{ + // The name of the repository. If empty the index is returned + // for all repositories. + string repository_name = 1; + + // If true return only models currently ready for inferencing. + bool ready = 2; +} + +message RepositoryIndexResponse +{ + // Index entry for a model. + message ModelIndex { + // The name of the model. + string name = 1; + + // The version of the model. + string version = 2; + + // The state of the model. + string state = 3; + + // The reason, if any, that the model is in the given state. + string reason = 4; + } + + // An index entry for each model. + repeated ModelIndex models = 1; +} +``` + +### Load + +The RepositoryModelLoad API requests that a model be loaded into +Triton, or reloaded if the model is already loaded. Errors are +indicated by the google.rpc.Status returned for the request. The OK +code indicates success and other codes indicate failure. The request +and response messages for RepositoryModelLoad are: + +``` +message RepositoryModelLoadRequest +{ + // The name of the repository to load from. If empty the model + // is loaded from any repository. + string repository_name = 1; + + // The name of the model to load, or reload. + string model_name = 2; + + // Optional parameters. + map parameters = 3; +} + +message RepositoryModelLoadResponse +{ +} +``` + +The RepositoryModelLoad API accepts the following parameters: + +- "config" : string parameter that contains a JSON representation of the model +configuration, which must be able to be parsed into [ModelConfig message from +model_config.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). +This config will be used for loading the model instead of the one in +the model directory. If config is provided, the (re-)load will be triggered as +the model metadata has been updated, and the same (re-)load behavior will be +applied. + +- "file:\/\" : bytes parameter that contains the model +file content. This convention will be used to specify the override model +directory to load the model from. For instance, if the user wants to specify a +model directory that contains an ONNX model as version 2, then the user will +specify the parameter to "file:2/model.onnx" : "\". Note that +"config" parameter must be provided to serve as the model configuration of the +override model directory. + +### Unload + +The RepositoryModelUnload API requests that a model be unloaded from +Triton. 
Errors are indicated by the google.rpc.Status returned for the +request. The OK code indicates success and other codes indicate +failure. The request and response messages for RepositoryModelUnload +are: + +``` +message RepositoryModelUnloadRequest +{ + // The name of the repository from which the model was originally + // loaded. If empty the repository is not considered. + string repository_name = 1; + + // The name of the model to unload. + string model_name = 2; + + // Optional parameters. + map parameters = 3; +} + +message RepositoryModelUnloadResponse +{ +} +``` + +The RepositoryModelUnload API accepts the following parameters: + +- "unload_dependents" : boolean parameter indicating that in addition + to unloading the requested model, also unload any dependent model + that was loaded along with the requested model. For example, request to + unload the models composing an ensemble will unload the ensemble as well. \ No newline at end of file diff --git a/docs/protocol/extension_parameters.md b/docs/protocol/extension_parameters.md new file mode 100644 index 0000000000..14ed4d1dc5 --- /dev/null +++ b/docs/protocol/extension_parameters.md @@ -0,0 +1,110 @@ + + +# Parameters Extension + +This document describes Triton's parameters extension. The +parameters extension allows an inference request to provide +custom parameters that cannot be provided as inputs. Because this extension is +supported, Triton reports “parameters†in the extensions field of its +Server Metadata. This extension uses the optional "parameters" +field in the KServe Protocol in +[HTTP](https://kserve.github.io/website/0.10/modelserving/data_plane/v2_protocol/#inference-request-json-object) +and +[GRPC](https://kserve.github.io/website/0.10/modelserving/data_plane/v2_protocol/#parameters). + +The following parameters are reserved for Triton's usage and should not be +used as custom parameters: + +- sequence_id +- priority +- timeout +- sequence_start +- sequence_end +- headers +- All the keys that start with `"triton_"` prefix. Some examples used today: + - `"triton_enable_empty_final_response"` request parameter + - `"triton_final_response"` response parameter + +When using both GRPC and HTTP endpoints, you need to make sure to not use +the reserved parameters list to avoid unexpected behavior. The reserved +parameters are not accessible in the Triton C-API. + +## HTTP/REST + +The following example shows how a request can include custom parameters. + +``` +POST /v2/models/mymodel/infer HTTP/1.1 +Host: localhost:8000 +Content-Type: application/json +Content-Length: +{ + "parameters" : { "my_custom_parameter" : 42 } + "inputs" : [ + { + "name" : "input0", + "shape" : [ 2, 2 ], + "datatype" : "UINT32", + "data" : [ 1, 2, 3, 4 ] + } + ], + "outputs" : [ + { + "name" : "output0", + } + ] +} +``` + +## GRPC + +The `parameters` field in the +ModelInferRequest message can be used to send custom parameters. + +## Forwarding HTTP/GRPC Headers as Parameters + +Triton can forward HTTP/GRPC headers as inference request parameters. By +specifying a regular expression in `--http-header-forward-pattern` and +`--grpc-header-forward-pattern`, +Triton will add the headers that match with the regular expression as request +parameters. All the forwarded headers will be added as a parameter with string +value. For example to forward all the headers that start with 'PREFIX_' from +both HTTP and GRPC, you should add `--http-header-forward-pattern PREFIX_.* +--grpc-header-forward-pattern PREFIX_.*` to your `tritonserver` command. 
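To illustrate the flags above, here is a hedged sketch of an HTTP request whose header would be forwarded; the model name `mymodel`, the input tensor, and the header name `PREFIX_userid` are hypothetical:

```
# Server assumed to be started with: tritonserver ... --http-header-forward-pattern PREFIX_.*
$ curl -X POST localhost:8000/v2/models/mymodel/infer \
    -H "PREFIX_userid: user42" \
    -d '{"inputs":[{"name":"input0","shape":[1],"datatype":"UINT32","data":[1]}]}'
```

With such a configuration the matching header is expected to reach the backend as a string-valued request parameter, alongside any parameters supplied in the request body.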
+ +By default, the regular expression pattern matches headers with case-insensitive +mode according to the HTTP protocol. If you want to enforce case-sensitive mode, +simplying adding the `(?-i)` prefix which turns off case-insensitive mode, e.g. +`--http-header-forward-pattern (?-i)PREFIX_.*`. Note, headers sent through the +Python HTTP client may be automatically lower-cased by internal client libraries. + +The forwarded headers can be accessed using the +[Python](https://github.com/triton-inference-server/python_backend#inference-request-parameters) +or C Backend APIs as inference request parameters. + diff --git a/docs/protocol/extension_schedule_policy.md b/docs/protocol/extension_schedule_policy.md new file mode 100644 index 0000000000..c3c57a63c7 --- /dev/null +++ b/docs/protocol/extension_schedule_policy.md @@ -0,0 +1,81 @@ + + +# Schedule Policy Extension + +This document describes Triton's schedule policy extension. The +schedule-policy extension allows an inference request to provide +parameters that influence how Triton handles and schedules the +request. Because this extension is supported, Triton reports +“schedule_policy†in the extensions field of its Server Metadata. +Note the policies are specific to [dynamic +batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher) +and only experimental support to [sequence +batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#sequence-batcher) +with the [direct](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#direct) +scheduling strategy. + +## Dynamic Batcher + +The schedule-policy extension uses request parameters to indicate the +policy. The parameters and their type are: + +- "priority" : int64 value indicating the priority of the + request. Priority value zero indicates that the default priority + level should be used (i.e. same behavior as not specifying the + priority parameter). Lower value priorities indicate higher priority + levels. Thus the highest priority level is indicated by setting the + parameter to 1, the next highest is 2, etc. + +- "timeout" : int64 value indicating the timeout value for the + request, in microseconds. If the request cannot be completed within + the time Triton will take a model-specific action such as + terminating the request. + +Both parameters are optional and, if not specified, Triton will handle +the request using the default priority and timeout values appropriate +for the model. + +## Sequence Batcher with Direct Scheduling Strategy + +**Note that the schedule policy for sequence batcher is at experimental stage +and it is subject to change.** + +The schedule-policy extension uses request parameters to indicate the +policy. The parameters and their type are: + +- "timeout" : int64 value indicating the timeout value for the + request, in microseconds. If the request cannot be completed within + the time Triton will terminate the request, as well as the corresponding + sequence and received requests of the sequence. The timeout will only be + applied to requests of the sequences that haven't been allocated a batch slot + for execution, the requests of the sequences that have been allocated batch + slots will not be affected by the timeout setting. + +The parameter is optional and, if not specified, Triton will handle +the request and corresponding sequence based on the model configuration. 
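As an illustration of the dynamic batcher parameters described above, the following is a minimal sketch of an inference request that sets both a priority and a timeout; the model name `mymodel`, the input tensor, and the specific parameter values are assumptions for the example:

```
POST /v2/models/mymodel/infer HTTP/1.1
Host: localhost:8000
Content-Type: application/json
Content-Length: <length>
{
  "parameters" : { "priority" : 1, "timeout" : 100000 },
  "inputs" : [
    {
      "name" : "input0",
      "shape" : [ 2, 2 ],
      "datatype" : "UINT32",
      "data" : [ 1, 2, 3, 4 ]
    }
  ]
}
```

Here `"priority" : 1` requests the highest priority level and `"timeout" : 100000` asks Triton to take the model-specific timeout action if the request cannot be completed within 100 milliseconds (the value is given in microseconds).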
\ No newline at end of file diff --git a/docs/protocol/extension_sequence.md b/docs/protocol/extension_sequence.md new file mode 100644 index 0000000000..3836d06fce --- /dev/null +++ b/docs/protocol/extension_sequence.md @@ -0,0 +1,147 @@ + + +# Sequence Extension + +This document describes Triton's sequence extension. The sequence +extension allows Triton to support stateful models that expect a +sequence of related inference requests. + +An inference request can specify that it is part of a sequence using +the “sequence_id†parameter in the request and by using the +“sequence_start†and “sequence_end†parameters to indicate the start +and end of sequences. + +Because this extension is supported, Triton reports "sequence" +in the extensions field of its Server Metadata. Triton may additionally +report "sequence(string_id)" in the extensions field of the Server Metadata +if the "sequence_id" parameter supports string types. + +- "sequence_id" : a string or uint64 value that identifies the sequence to which + a request belongs. All inference requests that belong to the same sequence + must use the same sequence ID. A sequence ID of 0 or "" indicates the + inference request is not part of a sequence. + +- "sequence_start" : boolean value if set to true in a request + indicates that the request is the first in a sequence. If not set, + or set to false the request is not the first in a sequence. If set + the "sequence_id" parameter must be set to a non-zero or non-empty string + value. + +- "sequence_end" : boolean value if set to true in a request indicates + that the request is the last in a sequence. If not set, or set to + false the request is not the last in a sequence. If set the + "sequence_id" parameter must be set to a non-zero or non-empty string + value. + +## HTTP/REST + +The following example shows how a request is marked as part of a +sequence. In this case the sequence_start and sequence_end parameters +are not used which means that this request is neither the start nor +end of the sequence. + +``` +POST /v2/models/mymodel/infer HTTP/1.1 +Host: localhost:8000 +Content-Type: application/json +Content-Length: +{ + "parameters" : { "sequence_id" : 42 } + "inputs" : [ + { + "name" : "input0", + "shape" : [ 2, 2 ], + "datatype" : "UINT32", + "data" : [ 1, 2, 3, 4 ] + } + ], + "outputs" : [ + { + "name" : "output0", + } + ] +} +``` + +The example below uses a v4 UUID string as the value for the "sequence_id" +parameter. + +``` +POST /v2/models/mymodel/infer HTTP/1.1 +Host: localhost:8000 +Content-Type: application/json +Content-Length: +{ + "parameters" : { "sequence_id" : "e333c95a-07fc-42d2-ab16-033b1a566ed5" } + "inputs" : [ + { + "name" : "input0", + "shape" : [ 2, 2 ], + "datatype" : "UINT32", + "data" : [ 1, 2, 3, 4 ] + } + ], + "outputs" : [ + { + "name" : "output0", + } + ] +} +``` + +## GRPC + +In addition to supporting the sequence parameters described above, the +GRPC API adds a streaming version of the inference API to allow a +sequence of inference requests to be sent over the same GRPC +stream. This streaming API is not required to be used for requests +that specify a sequence_id and may be used by requests that do not +specify a sequence_id. The ModelInferRequest is the same as for the +ModelInfer API. The ModelStreamInferResponse message is shown below. + +``` +service GRPCInferenceService +{ + … + + // Perform inference using a specific model with GRPC streaming. 
+ rpc ModelStreamInfer(stream ModelInferRequest) returns (stream ModelStreamInferResponse) {} +} + +// Response message for ModelStreamInfer. +message ModelStreamInferResponse +{ + // The message describing the error. The empty message + // indicates the inference was successful without errors. + String error_message = 1; + + // Holds the results of the request. + ModelInferResponse infer_response = 2; +} +``` diff --git a/docs/protocol/extension_shared_memory.md b/docs/protocol/extension_shared_memory.md new file mode 100644 index 0000000000..e9b5898ac9 --- /dev/null +++ b/docs/protocol/extension_shared_memory.md @@ -0,0 +1,590 @@ + + +# Shared-Memory Extension + +This document describes Triton's shared-memory extensions. The +shared-memory extensions allow a client to communicate input and +output tensors by system or CUDA shared memory. Using shared memory +instead of sending the tensor data over the GRPC or REST interface can +provide significant performance improvement for some use cases. +Because both of these extensions are supported, Triton reports +“system_shared_memory†and "cuda_shared_memory" in the extensions +field of its Server Metadata. + +The shared-memory extensions use a common set of parameters to +indicate that an input or output tensor is communicated via shared +memory. These parameters and their type are: + +- "shared_memory_region" : string value is the name of a previously + registered shared memory region. Region names share a namespace for + system-shared-memory regions and CUDA-shared-memory regions. + +- "shared_memory_offset" : int64 value is the offset, in bytes, into + the region where the data for the tensor starts. + +- "shared_memory_byte_size" : int64 value is the size, in bytes, of + the data. + +The “shared_memory_offset†parameter is optional and defaults to +zero. The other two parameters are required. If only one of the two is +given Triton will return an error. + +Note that there is no Windows support for shared memory yet. Jetson only +supports system shared memory. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. #optional +indicates an optional JSON field. + +The shared-memory parameters may be used in the `$request_input` +parameters to indicate that the corresponding input is being +communicated via shared memory. The parameters may be used in the +`$request_output` parameters to indicate that the requested output +should be communicated via shared memory. + +When these parameters are set for an input tensor the “data†field of +`$request_input` must not be set. If the “data†field is set Triton will +return an error. When these parameters are set for a requested output +tensor the returned `$response_output` must not define the “data†field. + +Shared memory regions must be created by the client and then +registered with Triton before they can be referenced with a +“shared_memory_region†parameter. The system and CUDA shared-memory +extensions each require a different set of APIs for registering a +shared memory region. + +### System Shared Memory + +The system shared memory extension requires Status, Register and +Unregister APIs. + +Triton exposes the following URL to register and unregister system +shared memory regions. 
+ +``` +GET v2/systemsharedmemory[/region/${REGION_NAME}]/status + +POST v2/systemsharedmemory/region/${REGION_NAME}/register + +POST v2/systemsharedmemory[/region/${REGION_NAME}]/unregister +``` + +#### Status + +A system-shared-memory status request is made with an HTTP GET to the +status endpoint. In the corresponding response the HTTP body contains +the response JSON. If REGION_NAME is provided in the URL the response +includes the status for the corresponding region. If REGION_NAME is +not provided in the URL the response includes the status for all +registered regions. + +A successful status request is indicated by a 200 HTTP status +code. The response object, identified as +`$system_shared_memory_status_response`, is returned in the HTTP body +for every successful request. + +``` +$system_shared_memory_status_response = +[ + { + "name" : $string, + "key" : $string, + "offset" : $number, + "byte_size" : $number + }, + … +] +``` + +- “name†: The name of the shared-memory region. + +- “key†: The key of the underlying memory object that contains the + shared memory region. + +- “offset†: The offset, in bytes, within the underlying memory object + to the start of the shared memory region. + +- “byte_size†: The size of the shared memory region, in bytes. + +A failed status request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$system_shared_memory_status_error_response` object. + +``` +$system_shared_memory_status_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +#### Register + +A system-shared-memory register request is made with a HTTP POST to +the register endpoint. In the corresponding response the HTTP body +contains the response JSON. A successful register request is indicated +by a 200 HTTP status code. + +The request object, identified as +`$system_shared_memory_register_request` must be provided in the HTTP +body. + +``` +$system_shared_memory_register_request = +{ + "key" : $string, + "offset" : $number, + "byte_size" : $number +} +``` + +- “key†: The key of the underlying memory object that contains the + shared memory region. + +- “offset†: The offset, in bytes, within the underlying memory object + to the start of the shared memory region. + +- “byte_size†: The size of the shared memory region, in bytes. + +A failed register request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$system_shared_memory_register_error_response` object. + +``` +$system_shared_memory_register_error_response = +{ + "error": $string +} +``` +- “error†: The descriptive message for the error. + +#### Unregister + +A system-shared-memory unregister request is made with an HTTP POST to +an unregister endpoint. In the request the HTTP body must be empty. + +A successful register request is indicated by a 200 HTTP status. If +REGION_NAME is provided in the URL the single region is +unregistered. If REGION_NAME is not provided in the URL all regions +are unregisered. + +A failed unregister request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$system_shared_memory_unregister_error_response` object. + +``` +$system_shared_memory_unregister_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +### CUDA Shared Memory + +The CUDA shared memory extension requires Status, Register and +Unregister APIs. 
+ +Triton exposes the following URL to register and unregister system +shared memory regions. + +``` +GET v2/cudasharedmemory[/region/${REGION_NAME}]/status + +POST v2/cudasharedmemory/region/${REGION_NAME}/register + +POST v2/cudasharedmemory[/region/${REGION_NAME}]/unregister +``` + +#### Status + +A CUDA-shared-memory status request is made with an HTTP GET to the +status endpoint. In the corresponding response the HTTP body contains +the response JSON. If REGION_NAME is provided in the URL the response +includes the status for the corresponding region. If REGION_NAME is +not provided in the URL the response includes the status for all +registered regions. + +A successful status request is indicated by a 200 HTTP status +code. The response object, identified as +`$cuda_shared_memory_status_response`, is returned in the HTTP body +for every successful request. + +``` +$cuda_shared_memory_status_response = +[ + { + "name" : $string, + "device_id" : $number, + "byte_size" : $number + }, + … +] +``` + +- “name†: The name of the shared memory region. + +- “device_id†: The GPU device ID where the cudaIPC handle was + created. + +- “byte_size†: The size of the shared memory region, in bytes. + +A failed status request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$cuda_shared_memory_status_error_response` object. + +``` +$cuda_shared_memory_status_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +#### Register + +A CUDA-shared-memory register request is made with a HTTP POST to +the register endpoint. In the corresponding response the HTTP body +contains the response JSON. A successful register request is indicated +by a 200 HTTP status code. + +The request object, identified as +`$cuda_shared_memory_register_request` must be provided in the HTTP +body. + +``` +$cuda_shared_memory_register_request = +{ + "raw_handle" : { "b64" : $string }, + "device_id" : $number, + "byte_size" : $number +} +``` + +- “raw_handle†: The serialized cudaIPC handle, base64 encoded. + +- “device_id†: The GPU device ID where the cudaIPC handle was + created. + +- “byte_size†: The size of the shared memory region, in bytes. + +A failed register request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$cuda_shared_memory_register_error_response` object. + +``` +$cuda_shared_memory_register_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +#### Unregister + +A CUDA-shared-memory unregister request is made with an HTTP POST to +an unregister endpoint. In the request the HTTP body must be empty. + +A successful register request is indicated by a 200 HTTP status. If +REGION_NAME is provided in the URL the single region is +unregistered. If REGION_NAME is not provided in the URL all regions +are unregisered. + +A failed unregister request must be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$cuda_shared_memory_unregister_error_response` object. + +``` +$cuda_shared_memory_unregister_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +## GRPC + +The shared-memory parameters may be used in the +ModelInferRequest::InferInputTensor message to indicate that the +corresponding input is being communicated via shared memory. 
The +parameters may be used in the +ModelInferRequest::InferRequestedOutputTensor message to indicate that +the requested output should be communicated via shared memory. + +When these parameters are set for an input tensor the “contents†field +of ModelInferRequest::InferInputTensor must not be set. If the +“contents†field is set Triton will return an error.. When these +parameters are set for a requested output tensor the “contents†field +of the ModelInferResponse::InferOutputTensor will not be set in the +inference response. + +Shared memory regions must be created by the client and then +registered with Triton before they can be referenced with a +“shared_memory_region†parameter. The system and CUDA shared-memory +extensions each require a different set of APIs. For all APIs, errors +are indicated by the google.rpc.Status returned for the request. The +OK code indicates success and other codes indicate failure. + +### System Shared Memory + +The system shared memory extension requires the following API: + +``` +service GRPCInferenceService +{ + … + + // Get the status of all registered system-shared-memory regions. + rpc SystemSharedMemoryStatus(SystemSharedMemoryStatusRequest) + returns (SystemSharedMemoryStatusResponse) {} + + // Register system-shared-memory region. + rpc SystemSharedMemoryRegister(SystemSharedMemoryRegisterRequest) + returns (SystemSharedMemoryRegisterResponse) {} + + // Unregister system-shared-memory region. + rpc SystemSharedMemoryUnregister(SystemSharedMemoryUnregisterRequest) + returns (SystemSharedMemoryUnregisterResponse) {} +} +``` + +#### Status + +The system-shared-memory status API provides information about +registered system shared-memory regions. Errors are indicated by the +google.rpc.Status returned for the request. The OK code indicates +success and other codes indicate failure. The request and response +messages for SystemSharedMemoryStatus are: + +``` +message SystemSharedMemoryStatusRequest +{ + // The name of the region to get status for. If empty the + // status is returned for all registered regions. + string name = 1; +} + +message SystemSharedMemoryStatusResponse +{ + // Status for a shared memory region. + message RegionStatus { + // The name for the shared memory region. + string name = 1; + + // The key of the underlying memory object that contains the + // shared memory region. + string key = 2; + + // Offset, in bytes, within the underlying memory object to + // the start of the shared memory region. + uint64 offset = 3; + + // Size of the shared memory region, in bytes. + uint64 byte_size = 4; + } + + // Status for each of the registered regions, indexed by region name. + map regions = 1; +} +``` + +#### Register + +The system-shared-memory register API is used to register a new +shared-memory region with Triton. After a region is registered it can +be used in the “shared_memory_region†parameter for an input or output +tensor. Errors are indicated by the google.rpc.Status returned for the +request. The OK code indicates success and other codes indicate +failure. The request and response messages for +SystemSharedMemoryRegister are: + +``` +message SystemSharedMemoryRegisterRequest +{ + // The name of the region to register. + string name = 1; + + // The key of the underlying memory object that contains the + // shared memory region. + string key = 2; + + // Offset, in bytes, within the underlying memory object to + // the start of the shared memory region. + uint64 offset = 3; + + // Size of the shared memory region, in bytes. 
+ uint64 byte_size = 4; +} + +message SystemSharedMemoryRegisterResponse +{ +} +``` + +#### Unregister + +The system-shared-memory unregister API provides unregisters a +shared-memory region from Triton. After a region is +unregistered it can no longer be used to communicate input and output +tensor contents. Errors are indicated by the google.rpc.Status +returned for the request. The OK code indicates success and other +codes indicate failure. The request and response messages for +SystemSharedMemoryStatus are: + +``` +message SystemSharedMemoryUnregisterRequest +{ + // The name of the region to unregister. If empty all system shared-memory + // regions are unregistered. + string name = 1; +} + +message SystemSharedMemoryUnregisterResponse +{ +} +``` + +### CUDA Shared Memory + +The CUDA shared memory extension requires the following API: + +``` +service GRPCInferenceService +{ + … + + // Get the status of all registered CUDA-shared-memory regions. + rpc CudaSharedMemoryStatus(CudaSharedMemoryStatusRequest) + returns (CudaSharedMemoryStatusResponse) {} + + // Register CUDA-shared-memory region. + rpc CudaSharedMemoryRegister(CudaSharedMemoryRegisterRequest) + returns (CudaSharedMemoryRegisterResponse) {} + + // Unregister CUDA-shared-memory region. + rpc CudaSharedMemorUnregister(CudaSharedMemoryUnregisterRequest) + returns (CudaSharedMemoryUnregisterResponse) {} +} +``` + +#### Status + +The CUDA-shared-memory status API provides information about +registered CUDA shared-memory regions. Errors are indicated by the +google.rpc.Status returned for the request. The OK code indicates +success and other codes indicate failure. The request and response +messages for CudaSharedMemoryStatus are: + +``` +message CudaSharedMemoryStatusRequest +{ + // The name of the region to get status for. If empty the + // status is returned for all registered regions. + string name = 1; +} + +message CudaSharedMemoryStatusResponse +{ + // Status for a shared memory region. + message RegionStatus { + // The name for the shared memory region. + string name = 1; + + // The GPU device ID where the cudaIPC handle was created. + uint64 device_id = 2; + + // Size of the shared memory region, in bytes. + uint64 byte_size = 3; + } + + // Status for each of the registered regions, indexed by region name. + map regions = 1; +} +``` + +#### Register + +The CUDA-shared-memory register API is used to register a new +shared-memory region with Triton. After a region is +registered it can be used in the “shared_memory_region†parameter for +an input or output tensor. Errors are indicated by the +google.rpc.Status returned for the request. The OK code indicates +success and other codes indicate failure. The request and response +messages for CudaSharedMemoryRegister are: + +``` +message CudaSharedMemoryRegisterRequest +{ + // The name of the region to register. + string name = 1; + + // The raw serialized cudaIPC handle. + bytes raw_handle = 2; + + // The GPU device ID on which the cudaIPC handle was created. + int64 device_id = 3; + + // Size of the shared memory region, in bytes. + uint64 byte_size = 4; +} + +message CudaSharedMemoryRegisterResponse +{ +} +``` + +#### Unregister + +The CUDA-shared-memory unregister API provides unregisters a +shared-memory region from Triton. After a region is unregistered it +can no longer be used to communicate input and output tensor +contents. Errors are indicated by the google.rpc.Status returned for +the request. The OK code indicates success and other codes indicate +failure. 
The request and response messages for CudaSharedMemoryStatus +are: + +``` +message CudaSharedMemoryUnregisterRequest +{ + // The name of the region to unregister. If empty all CUDA shared-memory + // regions are unregistered. + string name = 1; +} + +message CudaSharedMemoryUnregisterResponse +{ +} +``` diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md new file mode 100644 index 0000000000..4ff956b60a --- /dev/null +++ b/docs/protocol/extension_statistics.md @@ -0,0 +1,526 @@ + + +# Statistics Extension + +This document describes Triton's statistics extension. The statistics +extension enables the reporting of per-model (per-version) statistics +which provide aggregate information about all activity occurring for a +specific model (version) since Triton started. Because this extension +is supported, Triton reports “statistics†in the extensions field of +its Server Metadata. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. #optional +indicates an optional JSON field. + +Triton exposes the statistics endpoint at the following URL. The +specific model name portion of the URL is optional; if not provided +Triton will return the statistics for all versions of all models. If a +specific model is given in the URL the versions portion of the URL is +optional; if not provided Triton will return statistics for all +versions of the specified model. + +``` +GET v2/models[/${MODEL_NAME}[/versions/${MODEL_VERSION}]]/stats +``` + +### Statistics Response JSON Object + +A successful statistics request is indicated by a 200 HTTP status +code. The response object, identified as `$stats_model_response`, is +returned in the HTTP body for every successful statistics request. + +``` +$stats_model_response = +{ + "model_stats" : [ $model_stat, ... ] +} +``` + +Each `$model_stat` object gives the statistics for a specific model and +version. The `$version` field is optional for servers that do not +support versions. + +``` +$model_stat = +{ + "name" : $string, + "version" : $string #optional, + "last_inference" : $number, + "inference_count" : $number, + "execution_count" : $number, + "inference_stats" : $inference_stats, + "response_stats" : { $string : $response_stats, ... }, + "batch_stats" : [ $batch_stats, ... ], + "memory_usage" : [ $memory_usage, ...] +} +``` + +- "name" : The name of the model. + +- "version" : The version of the model. + +- "last_inference" : The timestamp of the last inference request made + for this model, as milliseconds since the epoch. + +- "inference_count" : The cumulative count of successful inference + requests made for this model. Each inference in a batched request is + counted as an individual inference. For example, if a client sends a + single inference request with batch size 64, "inference_count" will + be incremented by 64. Similarly, if a clients sends 64 individual + requests each with batch size 1, "inference_count" will be + incremented by 64. The "inference_count" value DOES NOT include cache hits. + +- "execution_count" : The cumulative count of the number of successful + inference executions performed for the model. When dynamic batching + is enabled, a single model execution can perform inferencing for + more than one inference request. 
For example, if a clients sends 64 + individual requests each with batch size 1 and the dynamic batcher + batches them into a single large batch for model execution then + "execution_count" will be incremented by 1. If, on the other hand, + the dynamic batcher is not enabled for that each of the 64 + individual requests is executed independently, then + "execution_count" will be incremented by 64. The "execution_count" value + DOES NOT include cache hits. + +- "inference_stats" : The aggregate statistics for the + model. So, for example, "inference_stats":"success" indicates the number of + successful inference requests for the model. + +- "response_stats" : The aggregate response statistics for the model. For + example, { "key" : { "response_stats" : "success" } } indicates the aggregate + statistics of successful responses at "key" for the model, where "key" + identifies each response generated by the model across different requests. For + example, given a model that generates three responses, the keys can be "0", + "1" and "2" identifying the three responses in order. + +- "batch_stats" : The aggregate statistics for each different batch + size that is executed in the model. The batch statistics indicate + how many actual model executions were performed and show differences + due to different batch size (for example, larger batches typically + take longer to compute). + +- "memory_usage" : The memory usage detected during model loading, which may be + used to estimate the memory to be released once the model is unloaded. Note + that the estimation is inferenced by the profiling tools and framework's + memory schema, therefore it is advised to perform experiments to understand + the scenario that the reported memory usage can be relied on. As a starting + point, the GPU memory usage for models in ONNX Runtime backend and TensorRT + backend is usually aligned. + +``` +$inference_stats = +{ + "success" : $duration_stat, + "fail" : $duration_stat, + "queue" : $duration_stat, + "compute_input" : $duration_stat, + "compute_infer" : $duration_stat, + "compute_output" : $duration_stat, + "cache_hit": $duration_stat, + "cache_miss": $duration_stat +} +``` + +- “success†: The count and cumulative duration for all successful + inference requests. The "success" count and cumulative duration includes + cache hits. + +- “fail†: The count and cumulative duration for all failed inference + requests. + +- “queue†: The count and cumulative duration that inference requests + wait in scheduling or other queues. The "queue" count and cumulative + duration includes cache hits. + +- “compute_input†: The count and cumulative duration to prepare input + tensor data as required by the model framework / backend. For + example, this duration should include the time to copy input tensor + data to the GPU. The "compute_input" count and cumulative duration DO NOT + include cache hits. + +- “compute_infer†: The count and cumulative duration to execute the + model. The "compute_infer" count and cumulative duration DO NOT include + cache hits. + +- “compute_output†: The count and cumulative duration to extract + output tensor data produced by the model framework / backend. For + example, this duration should include the time to copy output tensor + data from the GPU. The "compute_output" count and cumulative duration DO NOT + include cache hits. + +- "cache_hit" : The count of response cache hits and cumulative duration to + lookup and extract output tensor data from the Response Cache on a cache hit. 
+ For example, this duration should include the time to copy output tensor data + from the Response Cache to the response object. + +- "cache_miss" : The count of response cache misses and cumulative duration to + lookup and insert output tensor data to the Response Cache on a cache miss. + For example, this duration should include the time to copy output tensor data + from the response object to the Response Cache. + + +``` +$response_stats = +{ + "compute_infer" : $duration_stat, + "compute_output" : $duration_stat, + "success" : $duration_stat, + "fail" : $duration_stat, + "empty_response" : $duration_stat, + "cancel" : $duration_stat +} +``` + +- "compute_infer" : The count and cumulative duration to compute a response. +- "compute_output" : The count and cumulative duration to extract the output + tensor of a computed response. +- "success" : The count and cumulative duration of a success inference. The + duration is the sum of infer and output durations. +- "fail" : The count and cumulative duration of a fail inference. The duration + is the sum of infer and output durations. +- "empty_response" : The count and cumulative duration of an inference with an + empty / no response. The duration is infer durations. +- "cancel" : The count and cumulative duration of a inference cancellation. The + duration is for cleaning up resources held by cancelled inference requests. + + +``` +$batch_stats = +{ + "batch_size" : $number, + "compute_input" : $duration_stat, + "compute_infer" : $duration_stat, + "compute_output" : $duration_stat +} +``` + +- "batch_size" : The size of the batch. + +- "count" : The number of times the batch size was executed on the + model. A single model execution performs inferencing for the entire + request batch and can perform inferencing for multiple requests if + dynamic batching is enabled. + +- “compute_input†: The count and cumulative duration to prepare input + tensor data as required by the model framework / backend with the + given batch size. For example, this duration should include the time + to copy input tensor data to the GPU. + +- “compute_infer†: The count and cumulative duration to execute the + model with the given batch size. + +- “compute_output†: The count and cumulative duration to extract + output tensor data produced by the model framework / backend with + the given batch size. For example, this duration should include the + time to copy output tensor data from the GPU. + +The `$duration_stat` object reports a count and a total time. This +format can be sampled to determine not only long-running averages but +also incremental averages between sample points. + +``` +$duration_stat = +{ + "count" : $number, + "ns" : $number +} +``` + +- "count" : The number of times the statistic was collected. + +- “ns†: The total duration for the statistic in nanoseconds. + +``` +$memory_usage = +{ + "type" : $string, + "id" : $number, + "byte_size" : $number +} +``` + +- "type" : The type of memory, the value can be "CPU", "CPU_PINNED", "GPU". + +- "id" : The id of the memory, typically used with "type" to identify + a device that hosts the memory. + +- "byte_size" : The byte size of the memory. + +### Statistics Response JSON Error Object + +A failed statistics request will be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$repository_statistics_error_response` object. + +``` +$repository_statistics_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. 
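+
+Because each `$duration_stat` reports both a count and a cumulative time, a
+client can turn the raw statistics into averages with a few lines of code. The
+sketch below (not an official client utility) queries the HTTP/REST statistics
+endpoint for all models and prints an average queue time per model; the server
+address `localhost:8000` is an illustrative assumption.
+
+```python
+import requests
+
+# Illustrative server address; adjust for your deployment.
+resp = requests.get("http://localhost:8000/v2/models/stats")
+resp.raise_for_status()
+
+for model_stat in resp.json()["model_stats"]:
+    queue = model_stat["inference_stats"]["queue"]  # $duration_stat
+    count, total_ns = int(queue["count"]), int(queue["ns"])
+    if count > 0:
+        print(
+            f"{model_stat['name']} (version {model_stat.get('version', 'n/a')}): "
+            f"avg queue time {total_ns / count / 1e6:.3f} ms over {count} requests"
+        )
+```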
+ +## GRPC + +For the statistics extension Triton implements the following API: + +``` +service GRPCInferenceService +{ + … + + // Get the cumulative statistics for a model and version. + rpc ModelStatistics(ModelStatisticsRequest) + returns (ModelStatisticsResponse) {} +} +``` + +The ModelStatistics API returns model statistics. Errors are indicated +by the google.rpc.Status returned for the request. The OK code +indicates success and other codes indicate failure. The request and +response messages for ModelStatistics are: + +``` +message ModelStatisticsRequest +{ + // The name of the model. If not given returns statistics for all + // models. + string name = 1; + + // The version of the model. If not given returns statistics for + // all model versions. + string version = 2; +} + +message ModelStatisticsResponse +{ + // Statistics for each requested model. + repeated ModelStatistics model_stats = 1; +} +``` + +The statistics messages are: + +``` +// Statistic recording a cumulative duration metric. +message StatisticDuration +{ + // Cumulative number of times this metric occurred. + uint64 count = 1; + + // Total collected duration of this metric in nanoseconds. + uint64 ns = 2; +} + +// Statistics for a specific model and version. +message ModelStatistics +{ + // The name of the model. + string name = 1; + + // The version of the model. + string version = 2; + + // The timestamp of the last inference request made for this model, + // as milliseconds since the epoch. + uint64 last_inference = 3; + + // The cumulative count of successful inference requests made for this + // model. Each inference in a batched request is counted as an + // individual inference. For example, if a client sends a single + // inference request with batch size 64, "inference_count" will be + // incremented by 64. Similarly, if a clients sends 64 individual + // requests each with batch size 1, "inference_count" will be + // incremented by 64. The "inference_count" value DOES NOT include cache hits. + uint64 inference_count = 4; + + // The cumulative count of the number of successful inference executions + // performed for the model. When dynamic batching is enabled, a single + // model execution can perform inferencing for more than one inference + // request. For example, if a clients sends 64 individual requests each + // with batch size 1 and the dynamic batcher batches them into a single + // large batch for model execution then "execution_count" will be + // incremented by 1. If, on the other hand, the dynamic batcher is not + // enabled for that each of the 64 individual requests is executed + // independently, then "execution_count" will be incremented by 64. + // The "execution_count" value DOES NOT include cache hits. + uint64 execution_count = 5; + + // The aggregate statistics for the model. + InferStatistics inference_stats = 6; + + // The aggregate statistics for each different batch size that is + // executed in the model. The batch statistics indicate how many actual + // model executions were performed and show differences due to different + // batch size (for example, larger batches typically take longer to compute). + repeated InferBatchStatistics batch_stats = 7; + + // The memory usage detected during model loading, which may be + // used to estimate the memory to be released once the model is unloaded. 
Note + // that the estimation is inferenced by the profiling tools and framework's + // memory schema, therefore it is advised to perform experiments to understand + // the scenario that the reported memory usage can be relied on. As a starting + // point, the GPU memory usage for models in ONNX Runtime backend and TensorRT + // backend is usually aligned. + repeated MemoryUsage memory_usage = 8; + + // The key and value pairs for all decoupled responses statistics. The key is + // a string identifying a set of response statistics aggregated together (i.e. + // index of the response sent). The value is the aggregated response + // statistics. + map response_stats = 9; +} + +// Inference statistics. +message InferStatistics +{ + // Cumulative count and duration for successful inference + // request. The "success" count and cumulative duration includes + // cache hits. + StatisticDuration success = 1; + + // Cumulative count and duration for failed inference + // request. + StatisticDuration fail = 2; + + // The count and cumulative duration that inference requests wait in + // scheduling or other queues. The "queue" count and cumulative + // duration includes cache hits. + StatisticDuration queue = 3; + + // The count and cumulative duration to prepare input tensor data as + // required by the model framework / backend. For example, this duration + // should include the time to copy input tensor data to the GPU. + // The "compute_input" count and cumulative duration do not account for + // requests that were a cache hit. See the "cache_hit" field for more + // info. + StatisticDuration compute_input = 4; + + // The count and cumulative duration to execute the model. + // The "compute_infer" count and cumulative duration do not account for + // requests that were a cache hit. See the "cache_hit" field for more + // info. + StatisticDuration compute_infer = 5; + + // The count and cumulative duration to extract output tensor data + // produced by the model framework / backend. For example, this duration + // should include the time to copy output tensor data from the GPU. + // The "compute_output" count and cumulative duration do not account for + // requests that were a cache hit. See the "cache_hit" field for more + // info. + StatisticDuration compute_output = 6; + + // The count of response cache hits and cumulative duration to lookup + // and extract output tensor data from the Response Cache on a cache + // hit. For example, this duration should include the time to copy + // output tensor data from the Response Cache to the response object. + // On cache hits, triton does not need to go to the model/backend + // for the output tensor data, so the "compute_input", "compute_infer", + // and "compute_output" fields are not updated. Assuming the response + // cache is enabled for a given model, a cache hit occurs for a + // request to that model when the request metadata (model name, + // model version, model inputs) hashes to an existing entry in the + // cache. On a cache miss, the request hash and response output tensor + // data is added to the cache. 
See response cache docs for more info: + // https://github.com/triton-inference-server/server/blob/main/docs/response_cache.md + StatisticDuration cache_hit = 7; + + // The count of response cache misses and cumulative duration to lookup + // and insert output tensor data from the computed response to the cache + // For example, this duration should include the time to copy + // output tensor data from the response object to the Response Cache. + // Assuming the response cache is enabled for a given model, a cache + // miss occurs for a request to that model when the request metadata + // does NOT hash to an existing entry in the cache. See the response + // cache docs for more info: + // https://github.com/triton-inference-server/server/blob/main/docs/response_cache.md + StatisticDuration cache_miss = 8; +} + +// Statistics per decoupled response. +message InferResponseStatistics +{ + // The count and cumulative duration to compute a response. + StatisticDuration compute_infer = 1; + + // The count and cumulative duration to extract the output tensors of a + // response. + StatisticDuration compute_output = 2; + + // The count and cumulative duration for successful responses. + StatisticDuration success = 3; + + // The count and cumulative duration for failed responses. + StatisticDuration fail = 4; + + // The count and cumulative duration for empty responses. + StatisticDuration empty_response = 5; +} + +// Inference batch statistics. +message InferBatchStatistics +{ + // The size of the batch. + uint64 batch_size = 1; + + // The count and cumulative duration to prepare input tensor data as + // required by the model framework / backend with the given batch size. + // For example, this duration should include the time to copy input + // tensor data to the GPU. + StatisticDuration compute_input = 2; + + // The count and cumulative duration to execute the model with the given + // batch size. + StatisticDuration compute_infer = 3; + + // The count and cumulative duration to extract output tensor data + // produced by the model framework / backend with the given batch size. + // For example, this duration should include the time to copy output + // tensor data from the GPU. + StatisticDuration compute_output = 4; +} + +// Memory usage. +message MemoryUsage +{ + // The type of memory, the value can be "CPU", "CPU_PINNED", "GPU". + string type = 1; + + // The id of the memory, typically used with "type" to identify + // a device that hosts the memory. + int64_t id = 2; + + // The byte size of the memory. + uint64_t byte_size = 3; +} +``` diff --git a/docs/protocol/extension_trace.md b/docs/protocol/extension_trace.md new file mode 100644 index 0000000000..6472e1db24 --- /dev/null +++ b/docs/protocol/extension_trace.md @@ -0,0 +1,197 @@ + + +# Trace Extension + +This document describes Triton's trace extension. The trace extension enables +the client to configure the trace settings during a Triton run. Because this +extension is supported, Triton reports “trace†in the extensions field of +its Server Metadata. + +## HTTP/REST + +In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, +`$object` and `$array` refer to the fundamental JSON types. `#optional` +indicates an optional JSON field. + +Triton exposes the trace endpoint at the following URL. The client may use +HTTP GET request to retrieve the current trace setting. A HTTP POST request +will modify the trace setting, and the endpoint will return the updated trace +setting on success or an error in the case of failure. 
Optional model name +can be provided to get or to set the trace settings for specific model. + +``` +GET v2[/models/${MODEL_NAME}]/trace/setting + +POST v2[/models/${MODEL_NAME}]/trace/setting +``` + +### Trace Setting Response JSON Object + +A successful trace setting request is indicated by a 200 HTTP status +code. The response object, identified as `$trace_setting_response`, is +returned in the HTTP body for every successful trace setting request. + +``` +$trace_setting_response = +{ + $trace_setting, ... +} + +$trace_setting = $string : $string | [ $string, ...] +``` + +Each `$trace_setting` JSON describes a “nameâ€/â€value†pair, where the “name†is +the name of the trace setting and the “value†is a `$string representation` of the +setting value, or an array of `$string` for some settings. Currently the following +trace settings are defined: + +- "trace_file" : the file where the trace output will be saved. If +"log_frequency" is set, this will be the prefix of the files to save the +trace output, resulting files in name `"${trace_file}.0", "${trace_file}.1", ...`, +see trace setting "log_frequency" below for detail. +- "trace_level" : the trace level. "OFF" to disable tracing, +"TIMESTAMPS" to trace timestamps, "TENSORS" to trace tensors. +This value is an array of string where user may specify multiple levels to +trace multiple information. +- "trace_rate" : the trace sampling rate. The value represents how many requests +will one trace be sampled from. For example, if the trace rate is "1000", +1 trace will be sampled for every 1000 requests. +- "trace_count" : the number of remaining traces to be sampled. Once the value +becomes "0", no more traces will be sampled for the trace setting, and the +collected traces will be written to indexed trace file in the format described +in "log_frequency", regardless of the "log_frequency" status. +If the value is "-1", the number of traces to be sampled will not be limited. +- "log_frequency" : the frequency that Triton will log the +trace output to the files. If the value is "0", Triton will only log +the trace output to `${trace_file}` when shutting down. Otherwise, Triton will log +the trace output to `${trace_file}.${idx}` when it collects +the specified number of traces. For example, if the log frequency is "100", +when Triton collects the 100-th trace, it logs the traces to file +`"${trace_file}.0"`, and when it collects the 200-th trace, it logs the 101-th to +the 200-th traces to file `"${trace_file}.1"`. Note that the file index will be +reset to 0 when "trace_file" setting is updated. + + +### Trace Setting Response JSON Error Object + +A failed trace setting request will be indicated by an HTTP error status +(typically 400). The HTTP body must contain the +`$trace_setting_error_response` object. + +``` +$trace_setting_error_response = +{ + "error": $string +} +``` + +- “error†: The descriptive message for the error. + +#### Trace Setting Request JSON Object + +A trace setting request is made with a HTTP POST to +the trace endpoint. In the corresponding response the HTTP body contains the +response JSON. A successful request is indicated by a 200 HTTP status code. + +The request object, identified as `$trace_setting_request` must be provided in the HTTP +body. + +``` +$trace_setting_request = +{ + $trace_setting, ... +} +``` + +The `$trace_setting` JSON is defined in +[Trace Setting Response JSON Object](#trace-setting-response-json-object), only the specified +settings will be updated. 
In addition to the values mentioned in response JSON +object, JSON null value may be used to remove the specification of +the trace setting. In such case, the current global setting will be used. +Similarly, if this is the first request to initialize a model trace settings, +for the trace settings that are not specified in the request, the current global +setting will be used. + +## GRPC + +For the trace extension Triton implements the following API: + +``` +service GRPCInferenceService +{ + … + + // Update and get the trace setting of the Triton server. + rpc TraceSetting(TraceSettingRequest) + returns (TraceSettingResponse) {} +} +``` + +The Trace Setting API returns the latest trace settings. Errors are indicated +by the google.rpc.Status returned for the request. The OK code +indicates success and other codes indicate failure. The request and +response messages for Trace Setting are: + +``` +message TraceSettingRequest +{ + // The values to be associated with a trace setting. + // If no value is provided, the setting will be clear and + // the global setting value will be used. + message SettingValue + { + repeated string value = 1; + } + + // The new setting values to be updated, + // settings that are not specified will remain unchanged. + map settings = 1; + + // The name of the model to apply the new trace settings. + // If not given, the new settings will be applied globally. + string model_name = 2; +} + +message TraceSettingResponse +{ + message SettingValue + { + repeated string value = 1; + } + + // The latest trace settings. + map settings = 1; +} +``` + +The trace settings are mentioned in +[Trace Setting Response JSON Object](#trace-setting-response-json-object). +Note that if this is the first request to initialize +a model trace settings, for the trace settings that are not specified +in the request, the value will be copied from the current global settings. diff --git a/docs/python_api.rst b/docs/python_api.rst deleted file mode 100644 index 237a83f8d4..0000000000 --- a/docs/python_api.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Python API -========== - -Client ------- - -.. automodule:: tensorrtserver.api - :members: diff --git a/docs/quickstart.rst b/docs/quickstart.rst deleted file mode 100644 index c53cc06dcc..0000000000 --- a/docs/quickstart.rst +++ /dev/null @@ -1,52 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Quickstart -========== - -To quickly get the TensorRT Inference Server (TRTIS) up and running -follow these steps. After you've seen TRTIS in action you can revisit -the rest of the User Guide to learn more about its features. - -First, follow the instructions in -:ref:`section-installing-prebuilt-containers` to install the TRTIS -container. - -Next, use the :ref:`section-example-model-repository` section to -create an example model repository containing a couple of models that -you can serve with TRTIS. - -Now that you have a model repository, follow the instructions in -:ref:`section-running-the-inference-server` to start TRTIS. Use the -server's *Status* endpoint to :ref:`make sure the server and the -models are ready for -inferencing`. - -Finally, -:ref:`build` and -:ref:`run` the example -image-client application to perform image classification using TRTIS. diff --git a/docs/run.rst b/docs/run.rst deleted file mode 100644 index 720603d453..0000000000 --- a/docs/run.rst +++ /dev/null @@ -1,127 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
- # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Running the Server -================== - -.. _section-example-model-repository: - -Example Model Repository ------------------------- - -Before running the TensorRT Inference Server, you must first set up a -model repository containing the models that TRTIS will make available -for inferencing. - -An example model repository containing a Caffe2 ResNet50, a TensorFlow -Inception model, and a simple TensorFlow GraphDef model (used by the -:ref:`simple_client example `) are provided in the -`docs/examples/model_repository -`_ -directory. Before using the example model repository you must fetch -any missing model definition files from their public model zoos:: - - $ cd docs/examples - $ ./fetch_models.sh - -.. _section-running-the-inference-server: - -Running The Inference Server ----------------------------- - -Before running TRTIS, you must first set up a model repository -containing the models that TRTIS will make available -for inferencing. Section :ref:`section-model-repository` describes how -to create your own model repository. You can also use -:ref:`section-example-model-repository` to set up an example model -repository. - -Assuming the sample model repository is available in -/path/to/model/repository, the following command runs the container -you pulled from NGC or built locally:: - - $ nvidia-docker run --rm --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p8000:8000 -p8001:8001 -p8002:8002 -v/path/to/model/repository:/models trtserver --model-store=/models - -Where ** will be something like -**nvcr.io/nvidia/tensorrtserver:18.11-py3** if you pulled the -container from the NGC register, or **tensorrtserver** if you -:ref:`built it from source `. - -The nvidia-docker -v option maps /path/to/model/repository on the host -into the container at /models, and the -\\-model-store option to TRTIS -is used to point to /models as the model repository. 
- -The -p flags expose the container ports where TRTIS listens for HTTP -requests (port 8000), listens for GRPC requests (port 8001), and -reports Prometheus metrics (port 8002). - -The -\\-shm-size and -\\-ulimit flags are recommended to improve TRTIS -performance. For -\\-shm-size the minimum recommended size is 1g but -larger sizes may be necessary depending on the number and size of -models being served. - -For more information on the Prometheus metrics provided by the -inference server see :ref:`section-metrics`. - -.. _section-checking-inference-server-status: - -Checking Inference Server Status -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The simplest way to verify that TRTIS is running correctly is to use -the Status API to query the server’s status. From the host system use -*curl* to access the HTTP endpoint to request server status. The -response is protobuf text showing the status for the server and for -each model being served, for example:: - - $ curl localhost:8000/api/status - id: "inference:0" - version: "0.6.0" - uptime_ns: 23322988571 - model_status { - key: "resnet50_netdef" - value { - config { - name: "resnet50_netdef" - platform: "caffe2_netdef" - } - ... - version_status { - key: 1 - value { - ready_state: MODEL_READY - } - } - } - } - ready_state: SERVER_READY - -This status shows configuration information as well as indicating that -version 1 of the resnet50_netdef model is MODEL_READY. This means that -TRTIS is ready to accept inferencing requests for version 1 of that -model. A model version ready_state will show up as MODEL_UNAVAILABLE -if the model failed to load for some reason. diff --git a/docs/templates/layout.html b/docs/templates/layout.html deleted file mode 100644 index 4700514819..0000000000 --- a/docs/templates/layout.html +++ /dev/null @@ -1,78 +0,0 @@ - -{% extends "!layout.html" %} - {% block sidebartitle %} {{ super() }} - - - {% endblock %} - - {% block footer %} {{ super() }} - - - {% endblock %} diff --git a/docs/test.rst b/docs/test.rst deleted file mode 100644 index a6bbfcc509..0000000000 --- a/docs/test.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. - # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in the - # documentation and/or other materials provided with the distribution. - # * Neither the name of NVIDIA CORPORATION nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR - # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Testing -======= - -Currently there is no CI testing enabled for the open-source version -of the TensorRT Inference Server. We will enable CI testing in a -future update. - -There is a set of tests in the qa/ directory that can be run manually -to provide some testing. Before running these tests you must first -generate a test model repository containing the models needed by the -tests. - -Generate QA Model Repository ----------------------------- - -The QA model repository contains some simple models that are used to -verify the correctness of TRTIS. To generate the QA model repository:: - - $ cd qa/common - $ ./gen_qa_model_repository - -This will generate the model repository in /tmp/qa_model_repository. -The TensorRT models will be created for the GPU on the system that -CUDA considers device 0 (zero). If you have multiple GPUs on your -system see the documentation in the script for how to target a -specific GPU. - -Build QA Container ------------------- - -Next you need to build a QA version of the TRTIS container. This -container will contain TRTIS, the QA tests, and all the dependencies -needed to run the QA tests. You must first build the -tensorrtserver_build and tensorrtserver containers as described in -:ref:`section-building-the-server` and then build the QA container:: - - $ docker build -t tensorrtserver_qa -f Dockerfile.QA . - -Run QA Container ----------------- - -Now run the QA container and mount the QA model repository into the -container so the tests will be able to access it:: - - $ nvidia-docker run -it --rm -v/tmp/qa_model_repository:/models tensorrtserver_qa - -Within the container the QA tests are in /opt/tensorrtserver/qa. To run a test:: - - $ cd - $ ./test.sh diff --git a/docs/user_guide/architecture.md b/docs/user_guide/architecture.md new file mode 100644 index 0000000000..1dc87228d4 --- /dev/null +++ b/docs/user_guide/architecture.md @@ -0,0 +1,820 @@ + + +# Triton Architecture + +The following figure shows the Triton Inference Server high-level +architecture. The [model repository](model_repository.md) is a +file-system based repository of the models that Triton will make +available for inferencing. Inference requests arrive at the server via +either [HTTP/REST or GRPC](../customization_guide/inference_protocols.md) or by the [C +API](../customization_guide/inference_protocols.md) and are then routed to the appropriate per-model +scheduler. Triton implements [multiple scheduling and batching +algorithms](#models-and-schedulers) that can be configured on a +model-by-model basis. Each model's scheduler optionally performs +batching of inference requests and then passes the requests to the +[backend](https://github.com/triton-inference-server/backend/blob/main/README.md) +corresponding to the model type. The backend performs inferencing +using the inputs provided in the batched requests to produce the +requested outputs. The outputs are then returned. 
Triton supports a [backend C API](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api) that allows Triton to be extended with new functionality such as custom pre- and post-processing operations or even a new deep-learning framework.

The models being served by Triton can be queried and controlled by a dedicated [model management API](model_management.md) that is available by HTTP/REST or GRPC protocol, or by the C API.

Readiness and liveness health endpoints and utilization, throughput and latency metrics ease the integration of Triton into deployment frameworks such as Kubernetes.

![Triton Architecture Diagram](images/arch.jpg)

## Concurrent Model Execution

The Triton architecture allows multiple models and/or multiple instances of the same model to execute in parallel on the same system. The system may have zero, one, or many GPUs. The following figure shows an example with two models: model0 and model1. Assuming Triton is not currently processing any request, when two requests arrive simultaneously, one for each model, Triton immediately schedules both of them onto the GPU and the GPU’s hardware scheduler begins working on both computations in parallel. Models executing on the system's CPU are handled similarly by Triton except that the scheduling of the CPU threads executing each model is handled by the system's OS.

![Triton Multi-Model Execution Diagram](images/multi_model_exec.png)

By default, if multiple requests for the same model arrive at the same time, Triton will serialize their execution by scheduling only one at a time on the GPU, as shown in the following figure.

![Triton Multi-Model Serial Execution Diagram](images/multi_model_serial_exec.png)

Triton provides a [model configuration option called instance-group](model_configuration.md#instance-groups) that allows each model to specify how many parallel executions of that model should be allowed. Each such enabled parallel execution is referred to as an *instance*. By default, Triton gives each model a single instance for each available GPU in the system. By using the instance_group field in the model configuration, the number of execution instances for a model can be changed. The following figure shows model execution when model1 is configured to allow three instances. As shown in the figure, the first three model1 inference requests are immediately executed in parallel. The fourth model1 inference request must wait until one of the first three executions completes before beginning.

![Triton Multi-Model Parallel Execution Diagram](images/multi_model_parallel_exec.png)

## Models And Schedulers

Triton supports multiple scheduling and batching algorithms that can be selected independently for each model. This section describes *stateless*, *stateful* and *ensemble* models and how Triton provides schedulers to support those model types. For a given model, the selection and configuration of the scheduler is done with the [model's configuration file](model_configuration.md).

### Stateless Models

With respect to Triton's schedulers, a *stateless* model does not maintain state between inference requests. Each inference performed on a stateless model is independent of all other inferences using that model.

Examples of stateless models are CNNs such as image classification and object detection.
The [default +scheduler](model_configuration.md#default-scheduler) or [dynamic +batcher](model_configuration.md#dynamic-batcher) can be used as the +scheduler for these stateless models. + +RNNs and similar models which do have internal memory can be stateless +as long as the state they maintain does not span inference +requests. For example, an RNN that iterates over all elements in a +batch is considered stateless by Triton if the internal state is not +carried between batches of inference requests. The [default +scheduler](model_configuration.md#default-scheduler) can be used for +these stateless models. The [dynamic +batcher](model_configuration.md#dynamic-batcher) cannot be used since +the model is typically not expecting the batch to represent multiple +inference requests. + +### Stateful Models + +With respect to Triton's schedulers, a *stateful* model does maintain +state between inference requests. The model is expecting multiple +inference requests that together form a sequence of inferences that +must be routed to the same model instance so that the state being +maintained by the model is correctly updated. Moreover, the model may +require that Triton provide *control* signals indicating, for example, +the start and end of the sequence. + +The [sequence batcher](model_configuration.md#sequence-batcher) must +be used for these stateful models. As explained below, the sequence +batcher ensures that all inference requests in a sequence get routed +to the same model instance so that the model can maintain state +correctly. The sequence batcher also communicates with the model to +indicate when a sequence is starting, when a sequence is ending, when +a sequence has an inference request ready for execution, and the +*correlation ID* of the sequence. + +When making inference requests for a stateful model, the client +application must provide the same correlation ID to all requests in a +sequence, and must also mark the start and end of the sequence. The +correlation ID allows Triton to identify that the requests belong to +the same sequence. + +#### Control Inputs + +For a stateful model to operate correctly with the sequence batcher, +the model must typically accept one or more *control* input tensors +that Triton uses to communicate with the model. The +*ModelSequenceBatching::Control* section of the [model +configuration](model_configuration.md) indicates how the model exposes +the tensors that the sequence batcher should use for these +controls. All controls are optional. Below is portion of a model +configuration that shows an example configuration for all the +available control signals. + +``` +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "CORRID" + control [ + { + kind: CONTROL_SEQUENCE_CORRID + data_type: TYPE_UINT64 + } + ] + } + ] +} +``` + +* **Start**: The start input tensor is specified using + CONTROL_SEQUENCE_START in the configuration. The example + configuration indicates that the model has an input tensor called + START with a 32-bit floating point data-type. The sequence batcher + will define this tensor when executing an inference on the + model. The START tensor must be 1-dimensional with size equal to the + batch-size. 
Each element in the tensor indicates if the sequence in + the corresponding batch slot is starting or not. In the example + configuration, fp32_false_true indicates that a sequence start is + indicated by tensor element equal to 1, and non-start is indicated + by tensor element equal to 0. + +* **End**: The end input tensor is specified using + CONTROL_SEQUENCE_END in the configuration. The example configuration + indicates that the model has an input tensor called END with a + 32-bit floating point data-type. The sequence batcher will define + this tensor when executing an inference on the model. The END tensor + must be 1-dimensional with size equal to the batch-size. Each + element in the tensor indicates if the sequence in the corresponding + batch slot is ending or not. In the example configuration, + fp32_false_true indicates that a sequence end is indicated by tensor + element equal to 1, and non-end is indicated by tensor element equal + to 0. + +* **Ready**: The ready input tensor is specified using + CONTROL_SEQUENCE_READY in the configuration. The example + configuration indicates that the model has an input tensor called + READY with a 32-bit floating point data-type. The sequence batcher + will define this tensor when executing an inference on the + model. The READY tensor must be 1-dimensional with size equal to the + batch-size. Each element in the tensor indicates if the sequence in + the corresponding batch slot has an inference request ready for + inference. In the example configuration, fp32_false_true indicates + that a sequence ready is indicated by tensor element equal to 1, and + non-ready is indicated by tensor element equal to 0. + +* **Correlation ID**: The correlation ID input tensor is specified + using CONTROL_SEQUENCE_CORRID in the configuration. The example + configuration indicates that the model has an input tensor called + CORRID with a unsigned 64-bit integer data-type. The sequence + batcher will define this tensor when executing an inference on the + model. The CORRID tensor must be 1-dimensional with size equal to + the batch-size. Each element in the tensor indicates the correlation + ID of the sequence in the corresponding batch slot. + +#### Implicit State Management + +Implicit state management allows a stateful model to store its state inside +Triton. When using implicit state, the stateful model does not need to store +the state required for inference inside the model. + +Below is a portion of the model configuration that indicates the model +is using implicit state. + +``` +sequence_batching { + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT32 + dims: [ -1 ] + } + ] +} +``` + +The *state* section in the sequence_batching setting is used to indicate that +the model is using implicit state. The *input_name* field specifies the name of +the input tensor that will contain the input state. The *output_name* field +describes the name of the output tensor produced by the model that contains +output state. The output state provided by the model in the *ith* +request in the sequence will be used as the input state in the +*i+1th* request. The *dims* field specifies the dimensions of the +state tensors. When the *dims* field contains variable-sized dimensions, the +shape of the input state and output state does not have to match. + +For debugging purposes, the client can request the output state. 
In order to +allow the client to request the output state, the +[*output* section of the model configuration](./model_configuration.md#inputs-and-outputs) +must list the output state as one of the model outputs. Note that requesting the +output state from the client can increase the request latency because of the +additional tensors that have to be transferred. + +Implicit state management requires backend support. Currently, only +[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend) +[tensorrt_backend](https://github.com/triton-inference-server/tensorrt_backend), +and [pytorch_backend](https://github.com/triton-inference-server/pytorch_backend) +support implicit state. + +##### State Initialization + +By default, the starting request in the sequence contains uninitialized data for +the input state. The model can use the start flag in the request to detect the +beginning of a new sequence and initialize the model state by providing the +initial state in the model output. If the *dims* section in the *state* +description of the model contains variable-sized dimensions, Triton will use *1* +for every variable-sized dimension for the starting request. For other +non-starting requests in the sequence, the input state is the output state of +the previous request in the sequence. For an example ONNX model that uses +implicit state you can refer to this onnx model generated from the +`create_onnx_modelfile_wo_initial_state()` +[from this generation script](https://github.com/triton-inference-server/server/blob/main/qa/common/gen_qa_implicit_models.py). +This is a simple accumulator model that stores the partial sum of the requests +in a sequence in Triton using implicit state. For state initialization, if the +request is starting, the model sets the "OUTPUT\_STATE" to be equal to the +"INPUT" tensor. For non-starting requests, it sets the "OUTPUT\_STATE" tensor +to the sum of "INPUT" and "INPUT\_STATE" tensors. + +In addition to the default state initialization discussed above, Triton provides +two other mechanisms for initializing state. + +###### Initializing State from Zero. + +Below is an example of initializing state from zero. + +``` +sequence_batching { + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT32 + dims: [ -1 ] + initial_state: { + data_type: TYPE_INT32 + dims: [ 1 ] + zero_data: true + name: "initial state" + } + } + ] +} +``` + +Note that in the example above variable dimensions in the state description are +converted to fixed size dimensions. + +###### Initializing State from File + +For initializing state from file, you need to create a directory named +"initial\_state" under the model directory. The file that contains the initial +state under this directory needs to be provided in the *data_file* field. +The data stored in this file will be used in row-major order as the initial +state. Below is an example state description initializing state from file. + +``` +sequence_batching { + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT32 + dims: [ -1 ] + initial_state: { + data_type: TYPE_INT32 + dims: [ 1 ] + data_file: "initial_state_data" + name: "initial state" + } + } + ] +} +``` + +#### Scheduling Strategies + +The sequence batcher can employ one of two scheduling strategies when +deciding how to batch the sequences that are routed to the same model +instance. These strategies are [direct](#direct) and [oldest](#oldest). 
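With either strategy the client tags its requests as described above in [Stateful Models](#stateful-models): every request in a sequence carries the same correlation ID along with start and end markers. The sketch below, which assumes the Python GRPC client and a hypothetical stateful model and tensor name, shows one way a client might send a three-request sequence.

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")
sequence_id = 42  # correlation ID shared by every request in this sequence

values = [np.array([[v]], dtype=np.float32) for v in (10.0, 20.0, 30.0)]
for i, value in enumerate(values):
    inp = grpcclient.InferInput("INPUT", list(value.shape), "FP32")
    inp.set_data_from_numpy(value)
    # sequence_start / sequence_end drive the START and END control tensors
    # that the sequence batcher passes to the model.
    client.infer(
        model_name="my_stateful_model",
        inputs=[inp],
        sequence_id=sequence_id,
        sequence_start=(i == 0),
        sequence_end=(i == len(values) - 1),
    )
```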
+ +##### Direct + +With the Direct scheduling strategy the sequence batcher ensures not +only that all inference requests in a sequence are routed to the same +model instance, but also that each sequence is routed to a dedicated +batch slot within the model instance. This strategy is required when +the model maintains state for each batch slot, and is expecting all +inference requests for a given sequence to be routed to the same slot +so that the state is correctly updated. + +As an example of the sequence batcher using the Direct scheduling +strategy, assume a TensorRT stateful model that has the following +model configuration. + +``` +name: "direct_stateful_model" +platform: "tensorrt_plan" +max_batch_size: 2 +sequence_batching { + max_sequence_idle_microseconds: 5000000 + direct { } + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ 100, 100 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 10 ] + } +] +instance_group [ + { + count: 2 + } +] +``` + +The sequence_batching section indicates that the model should use the +sequence batcher and the Direct scheduling strategy. In this example +the model only requires a *start* and *ready* control input from the +sequence batcher so only those controls are listed. The instance_group +indicates two instances of the model should be instantiated and +max_batch_size indicates that each of those instances should perform +batch-size 2 inferences. The following figure shows a representation +of the sequence batcher and the inference resources specified by this +configuration. + +![Sequence Batching Example](images/sequence_example0.png) + +Each model instance is maintaining state for each batch slot, and is +expecting all inference requests for a given sequence to be routed to +the same slot so that the state is correctly updated. For this example +that means that Triton can simultaneously perform inference for up to +four sequences. + +Using the Direct scheduling strategy, the sequence batcher: + +* Recognizes when an inference request starts a new sequence and + allocates a batch slot for that sequence. If no batch slot is + available for the new sequence, Triton places the inference request + in a backlog. + +* Recognizes when an inference request is part of a sequence that has + an allocated batch slot and routes the request to that slot. + +* Recognizes when an inference request is part of a sequence that is + in the backlog and places the request in the backlog. + +* Recognizes when the last inference request in a sequence has been + completed. The batch slot occupied by that sequence is immediately + reallocated to a sequence in the backlog, or freed for a future + sequence if there is no backlog. + +The following figure shows how multiple sequences are scheduled onto +the model instances using the Direct scheduling strategy. On the left +the figure shows several sequences of requests arriving at +Triton. 
Each sequence could be made up of any number of inference +requests and those individual inference requests could arrive in any +order relative to inference requests in other sequences, except that +the execution order shown on the right assumes that the first +inference request of sequence 0 arrives before any inference request +in sequences 1-5, the first inference request of sequence 1 arrives +before any inference request in sequences 2-5, etc. + +The right of the figure shows how the inference request sequences are +scheduled onto the model instances over time. + +![Sequence Batcher Example](images/sequence_example1.png) + +The following figure shows the sequence batcher uses the control input +tensors to communicate with the model. The figure shows two sequences +assigned to the two batch slots in a model instance. Inference +requests for each sequence arrive over time. The START and READY rows +show the input tensor values used for each execution of the +model. Over time the following happens: + +* The first request arrives for the sequence in slot0. Assuming the + model instance is not already executing an inference, the sequence + scheduler immediately schedules the model instance to execute + because an inference request is available. + +* This is the first request in the sequence so the corresponding + element in the START tensor is set to 1. There is no request + available in slot1 so the READY tensor shows only slot0 as ready. + +* After the inference completes the sequence scheduler sees that there + are no requests available in any batch slot and so the model + instance sits idle. + +* Next, two inference requests arrive close together in time so that + the sequence scheduler sees them both available in their respective + batch slots. The scheduler immediately schedules the model instance + to perform a batch-size 2 inference and uses START and READY to show + that both slots have an inference request available but that only + slot1 is the start of a new sequence. + +* The processing continues in a similar manner for the other inference + requests. + +![Sequence Batcher Example](images/sequence_example2.png) + +##### Oldest + +With the Oldest scheduling strategy the sequence batcher ensures that +all inference requests in a sequence are routed to the same model +instance and then uses the [dynamic +batcher](model_configuration.md#dynamic-batcher) to batch together +multiple inferences from different sequences into a batch that +inferences together. With this strategy the model must typically use +the CONTROL_SEQUENCE_CORRID control so that it knows which sequence +each inference request in the batch belongs to. The +CONTROL_SEQUENCE_READY control is typically not needed because all +inferences in the batch will always be ready for inference. 
+ +As an example of the sequence batcher using the Oldest scheduling +strategy, assume a stateful model that has the following model +configuration: + +``` +name: "oldest_stateful_model" +platform: "tensorflow_savedmodel" +max_batch_size: 2 +sequence_batching { + max_sequence_idle_microseconds: 5000000 + oldest + { + max_candidate_sequences: 4 + } + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "CORRID" + control [ + { + kind: CONTROL_SEQUENCE_CORRID + data_type: TYPE_UINT64 + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ 100, 100 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 10 ] + } +] +``` + +The sequence_batching section indicates that the model should use the +sequence batcher and the Oldest scheduling strategy. The Oldest +strategy is configured so that the sequence batcher maintains up to 4 +active candidate sequences from which it prefers to form dynamic +batches of size 2. In this example the model requires a *start*, +*end*, and *correlation ID* control input from the sequence +batcher. The following figure shows a representation of the sequence +batcher and the inference resources specified by this configuration. + +![Sequence Batching Example](images/dyna_sequence_example0.png) + +Using the Oldest scheduling strategy, the sequence batcher: + +* Recognizes when an inference request starts a new sequence and + attempts to find a model instance that has room for a candidate + sequence. If no model instance has room for a new candidate + sequence, Triton places the inference request in a backlog. + +* Recognizes when an inference request is part of a sequence that is + already a candidate sequence in some model instance and routes the + request to that model instance. + +* Recognizes when an inference request is part of a sequence that is + in the backlog and places the request in the backlog. + +* Recognizes when the last inference request in a sequence has been + completed. The model instance immediately removes a sequence from + the backlog and makes it a candidate sequence in the model instance, + or records that the model instance can handle a future sequence if + there is no backlog. + +The following figure shows how multiple sequences are scheduled onto +the model instance specified by the above example configuration. On +the left the figure shows four sequences of requests arriving at +Triton. Each sequence is composed of multiple inference requests as +shown in the figure. The center of the figure shows how the inference +request sequences are batched onto the model instance over time, +assuming that the inference requests for each sequence arrive at the +same rate with sequence A arriving just before B, which arrives just +before C, etc. The Oldest strategy forms a dynamic batch from the +oldest requests but never includes more than one request from a given +sequence in a batch (for example, the last two inferences in sequence +D are not batched together). + +![Sequence Batcher Example](images/dyna_sequence_example1.png) + +### Ensemble Models + +An ensemble model represents a *pipeline* of one or more models and +the connection of input and output tensors between those +models. 
Ensemble models are intended to be used to encapsulate a +procedure that involves multiple models, such as "data preprocessing +-> inference -> data postprocessing". Using ensemble models for this +purpose can avoid the overhead of transferring intermediate tensors +and minimize the number of requests that must be sent to Triton. + +The ensemble scheduler must be used for ensemble models, regardless of +the scheduler used by the models within the ensemble. With respect to +the ensemble scheduler, an *ensemble* model is not an actual +model. Instead, it specifies the dataflow between models within the +ensemble as *ModelEnsembling::Step* entries in the model +configuration. The scheduler collects the output tensors in each step, +provides them as input tensors for other steps according to the +specification. In spite of that, the ensemble model is still viewed as +a single model from an external view. + +Note that the ensemble models will inherit the characteristics of the +models involved, so the meta-data in the request header must comply +with the models within the ensemble. For instance, if one of the +models is stateful model, then the inference request for the ensemble +model should contain the information mentioned in [Stateful +Models](#stateful-models), which will be provided to the stateful +model by the scheduler. + +As an example consider an ensemble model for image classification and +segmentation that has the following model configuration: + +``` +name: "ensemble_model" +platform: "ensemble" +max_batch_size: 1 +input [ + { + name: "IMAGE" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "CLASSIFICATION" + data_type: TYPE_FP32 + dims: [ 1000 ] + }, + { + name: "SEGMENTATION" + data_type: TYPE_FP32 + dims: [ 3, 224, 224 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "image_preprocess_model" + model_version: -1 + input_map { + key: "RAW_IMAGE" + value: "IMAGE" + } + output_map { + key: "PREPROCESSED_OUTPUT" + value: "preprocessed_image" + } + }, + { + model_name: "classification_model" + model_version: -1 + input_map { + key: "FORMATTED_IMAGE" + value: "preprocessed_image" + } + output_map { + key: "CLASSIFICATION_OUTPUT" + value: "CLASSIFICATION" + } + }, + { + model_name: "segmentation_model" + model_version: -1 + input_map { + key: "FORMATTED_IMAGE" + value: "preprocessed_image" + } + output_map { + key: "SEGMENTATION_OUTPUT" + value: "SEGMENTATION" + } + } + ] +} +``` + +The ensemble\_scheduling section indicates that the ensemble scheduler will be +used and that the ensemble model consists of three different models. Each +element in step section specifies the model to be used and how the inputs and +outputs of the model are mapped to tensor names recognized by the scheduler. For +example, the first element in step specifies that the latest version of +image\_preprocess\_model should be used, the content of its input "RAW\_IMAGE" +is provided by "IMAGE" tensor, and the content of its output +"PREPROCESSED\_OUTPUT" will be mapped to "preprocessed\_image" tensor for later +use. The tensor names recognized by the scheduler are the ensemble inputs, the +ensemble outputs and all values in the input\_map and the output\_map. + +The models composing the ensemble may also have dynamic batching +enabled. Since ensemble models are just routing the data between +composing models, Triton can take requests into an ensemble model +without modifying the ensemble's configuration to exploit the dynamic +batching of the composing models. 
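From the client's point of view, the ensemble above is invoked like any single model. The following is a rough sketch, assuming the Python HTTP client and a placeholder image file; the shapes include the batch dimension implied by `max_batch_size: 1`.

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# "IMAGE" is TYPE_STRING (BYTES in the client API), so send the raw encoded
# image bytes as an object array; the leading dimension is the batch.
with open("mug.jpg", "rb") as f:
    image = np.array([[f.read()]], dtype=np.object_)

image_input = httpclient.InferInput("IMAGE", [1, 1], "BYTES")
image_input.set_data_from_numpy(image)

# A single request drives the whole preprocess -> classification /
# segmentation pipeline inside Triton.
result = client.infer(model_name="ensemble_model", inputs=[image_input])
classification = result.as_numpy("CLASSIFICATION")  # shape [1, 1000]
segmentation = result.as_numpy("SEGMENTATION")      # shape [1, 3, 224, 224]
```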
Assuming that only the ensemble model, the preprocess model, the classification model and the segmentation model are being served, the client applications will see them as four different models which can process requests independently. However, the ensemble scheduler will view the ensemble model as the following.

![Ensemble Example](images/ensemble_example0.png)

When an inference request for the ensemble model is received, the ensemble scheduler will:

1. Recognize that the "IMAGE" tensor in the request is mapped to input "RAW\_IMAGE" in the preprocess model.

2. Check models within the ensemble and send an internal request to the preprocess model because all the input tensors required are ready.

3. Recognize the completion of the internal request, collect the output tensor and map the content to "preprocessed\_image", which is a unique name known within the ensemble.

4. Map the newly collected tensor to inputs of the models within the ensemble. In this case, the inputs of "classification\_model" and "segmentation\_model" will be mapped and marked as ready.

5. Check models that require the newly collected tensor and send internal requests to models whose inputs are ready, the classification model and the segmentation model in this case. Note that the responses will be in arbitrary order depending on the load and computation time of individual models.

6. Repeat steps 3-5 until no more internal requests should be sent, and then respond to the inference request with the tensors mapped to the ensemble output names.

Unlike other models, ensemble models do not support the "instance_group" field in the model configuration. The reason is that the ensemble scheduler itself is mainly an event-driven scheduler with very minimal overhead so it's almost never the bottleneck of the pipeline. The composing models within the ensemble can be individually scaled up or down with their respective `instance_group` settings. To optimize your model pipeline performance, you can use [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) to find the optimal model configurations.

#### Additional Resources

You can find additional end-to-end ensemble examples in the links below:
* [This guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_5-Model_Ensembles) explores the concept of ensembles with a running example.
* [Preprocessing in Python Backend Using Ensemble](https://github.com/triton-inference-server/python_backend#preprocessing)
* [Accelerating Inference with NVIDIA Triton Inference Server and NVIDIA DALI](https://developer.nvidia.com/blog/accelerating-inference-with-triton-inference-server-and-dali/)
* [Using RAPIDS AI with NVIDIA Triton Inference Server](https://github.com/rapidsai/rapids-examples/tree/main/rapids_triton_example)

diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md
new file mode 100644
index 0000000000..88a7037c7f
--- /dev/null
+++ b/docs/user_guide/custom_operations.md
@@ -0,0 +1,195 @@

# Custom Operations

Modeling frameworks that allow custom operations are partially supported by the Triton Inference Server. Custom operations can be added to Triton at build time or at startup and are made available to all loaded models.
+ +## TensorRT + +TensorRT allows a user to create [custom +layers](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#extending) +which can then be used in TensorRT models. For those models to run in +Triton the custom layers must be made available. + +To make the custom layers available to Triton, the TensorRT custom +layer implementations must be compiled into one or more shared +libraries which must then be loaded into Triton using LD_PRELOAD. For +example, assuming your TensorRT custom layers are compiled into +libtrtcustom.so, starting Triton with the following command makes +those custom layers available to all TensorRT models. + +```bash +$ LD_PRELOAD=libtrtcustom.so:${LD_PRELOAD} tritonserver --model-repository=/tmp/models ... +``` + +A limitation of this approach is that the custom layers must be +managed separately from the model repository itself. And more +seriously, if there are custom layer name conflicts across multiple +shared libraries there is currently no way to handle it. + +When building the custom layer shared library it is important to use +the same version of TensorRT as is being used in Triton. You can find +the TensorRT version in the [Triton Release +Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). A +simple way to ensure you are using the correct version of TensorRT is +to use the [NGC TensorRT +container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt) +corresponding to the Triton container. For example, if you are using +the 24.09 version of Triton, use the 24.09 version of the TensorRT +container. + +## TensorFlow + +TensorFlow allows users to [add custom +operations](https://www.tensorflow.org/guide/create_op) which can then +be used in TensorFlow models. You can load custom TensorFlow operations +into Triton in two ways: +* At model load time, by listing them in the model configuration. +* At server launch time, by using LD_PRELOAD. + +To register your custom operations library via the the model configuration, +you can include it as an additional field. See the below configuration as an example. + +```bash +$ model_operations { op_library_filename: "path/to/libtfcustom.so" } +``` + +Note that even though the models are loaded at runtime, multiple models can use the custom +operators. There is currently no way to deallocate the custom operators, so they will stay +available until Triton is shut down. + +You can also register your custom operations library via LD_PRELOAD. For example, +assuming your TensorFlow custom operations are compiled into libtfcustom.so, +starting Triton with the following command makes those operations +available to all TensorFlow models. + +```bash +$ LD_PRELOAD=libtfcustom.so:${LD_PRELOAD} tritonserver --model-repository=/tmp/models ... +``` + +With this approach, all TensorFlow custom operations depend on a TensorFlow shared +library that must be available to the custom shared library when it is +loading. In practice, this means that you must make sure that +/opt/tritonserver/backends/tensorflow1 or +/opt/tritonserver/backends/tensorflow2 is on the library path before +issuing the above command. There are several ways to control the +library path and a common one is to use the LD_LIBRARY_PATH. You can +set LD_LIBRARY_PATH in the "docker run" command or inside the +container. 
```bash
$ export LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow1:$LD_LIBRARY_PATH
```

A limitation of this approach is that the custom operations must be managed separately from the model repository itself. And more seriously, if there are custom layer name conflicts across multiple shared libraries there is currently no way to handle it.

When building the custom operations shared library it is important to use the same version of TensorFlow as is being used in Triton. You can find the TensorFlow version in the [Triton Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). A simple way to ensure you are using the correct version of TensorFlow is to use the [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) corresponding to the Triton container. For example, if you are using the 24.09 version of Triton, use the 24.09 version of the TensorFlow container.

## PyTorch

TorchScript allows users to [add custom operations](https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html) which can then be used in TorchScript models. By using LD_PRELOAD you can load your custom C++ operations into Triton. For example, if you follow the instructions in the [pytorch/extension-script](https://github.com/pytorch/extension-script) repository and your TorchScript custom operations are compiled into libpytcustom.so, starting Triton with the following command makes those operations available to all PyTorch models. All PyTorch custom operations depend on one or more PyTorch shared libraries that must be available to the custom shared library when it is loading. In practice this means that you must make sure that /opt/tritonserver/backends/pytorch is on the library path while launching the server. There are several ways to control the library path and a common one is to use the LD_LIBRARY_PATH.

```bash
$ LD_LIBRARY_PATH=/opt/tritonserver/backends/pytorch:$LD_LIBRARY_PATH LD_PRELOAD=libpytcustom.so:${LD_PRELOAD} tritonserver --model-repository=/tmp/models ...
```

A limitation of this approach is that the custom operations must be managed separately from the model repository itself. And more seriously, if there are custom layer name conflicts across multiple shared libraries or the handles used to register them in PyTorch there is currently no way to handle it.

Starting with the 20.07 release of Triton the [TorchVision operations](https://github.com/pytorch/vision) will be included with the PyTorch backend and hence they do not have to be explicitly added as custom operations.

When building the custom operations shared library it is important to use the same version of PyTorch as is being used in Triton. You can find the PyTorch version in the [Triton Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). A simple way to ensure you are using the correct version of PyTorch is to use the [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) corresponding to the Triton container. For example, if you are using the 24.09 version of Triton, use the 24.09 version of the PyTorch container.

## ONNX

ONNX Runtime allows users to [add custom operations](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) which can then be used in ONNX models.
To register your custom +operations library you need to include it in the model configuration +as an additional field. For example, if you follow [this +example](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/test/shared_lib/test_inference.cc) +from the +[microsoft/onnxruntime](https://github.com/microsoft/onnxruntime) +repository and your ONNXRuntime custom operations are compiled into +libonnxcustom.so, adding the following to the model configuration of +your model makes those operations available to that specific ONNX +model. + +```bash +$ model_operations { op_library_filename: "/path/to/libonnxcustom.so" } +``` + +When building the custom operations shared library it is important to +use the same version of ONNXRuntime as is being used in Triton. You +can find the ONNXRuntime version in the [Triton Release +Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html). diff --git a/docs/user_guide/debugging_guide.md b/docs/user_guide/debugging_guide.md new file mode 100644 index 0000000000..e5b0263d30 --- /dev/null +++ b/docs/user_guide/debugging_guide.md @@ -0,0 +1,151 @@ + + +# Debugging Guide +This guide goes over first-step troubleshooting for common scenarios in which Triton is behaving unexpectedly or failing. Below, we break down the issues into these categories: + +- **[Configuration](#configuration-issues)**: Triton reports an error with your configuration file. +- **[Model](#model-issues)**: Your model fails to load or perform inference. +- Server: The server is crashing or unavailable. +- Client: The client is failing in sending and receiving data to the server. +- Performance: Triton is not achieving optimal performance. + +Regardless of the category of your issue, it is worthwhile to try running in the latest Triton container, whenever possible. While we provide support to older containers, fixes get merged into the next release. By checking the latest release, you can spot whether this issue has already been resolved. + +You can also search [Triton’s GitHub issues](https://github.com/triton-inference-server/server/issues) to see if someone previously asked about your issue. If you received an error, you can use a few keywords from the error as a search term. + +Triton provides different types of errors and statuses, relevant across a wide swath of issues. Here is an overview of them: + +| Error | Definition | Example | +| ----- | ---------- | ------- | +|Already Exists | Returned when an action cannot be done because there is already an existing item. | A registered model fails to be registered again.| +| Internal | Returned when there is an unexpected failure within the Triton code. | A memory allocation fails. | +| Invalid Arg | Returned when an invalid argument is provided to a function | A model config has an invalid parameter | +| Not Found | Returned when a requested resource is unable to be found | A shared library is unable to be found | +| Unavailable | Returned when a requested resource is found but unavailable | A requested model is not ready for inference | +| Unknown | Returned for cases where the reason for the error is unknown | This error code should not be used | +| Unsupported | Returned when an option is unsupported | A model config includes a parameter that is not yet supported for that backend | + +## Configuration Issues + +Before proceeding, please see if the model configuration documentation [here](./model_configuration.md) resolves your question. 
Beyond that, the best places to find a sample model configuration for your use cases are: + +- The server [qa folder](https://github.com/triton-inference-server/server/tree/main/qa). You can find test scripts covering most features, including some which update the model config files to do so. + - [Custom_models](https://github.com/triton-inference-server/server/tree/main/qa/custom_models), [ensemble_models](https://github.com/triton-inference-server/server/tree/main/qa/ensemble_models), and [python_models](https://github.com/triton-inference-server/server/tree/main/qa/python_models) include examples of configs for their respective use cases. + - [L0_model_config](https://github.com/triton-inference-server/server/tree/main/qa/L0_model_config) tests many types of incomplete model configs. + +Note that if you are running into an issue with [perf_analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) or [Model Analyzer](https://github.com/triton-inference-server/model_analyzer), try loading the model onto Triton directly. This checks if the configuration is incorrect or the perf_analyzer or Model Analyzer options need to be updated. + +## Model Issues +**Step 1. Run Models Outside of Triton** + +If you are running into an issue with loading or running a model, the first step is to ensure your model runs in its framework outside of Triton. For example, you can run ONNX models in ONNX Runtime and TensorRT models in trtexec. If this check fails, the issue is happening within the framework and not within Triton. + +**Step 2. Find the Error Message** + +If you receive an error message, you may be able to find where it was generated by searching the code. GitHub provides instructions for searching code [here](https://docs.github.com/en/search-github/searching-on-github/searching-code). A generic search through the Triton organization is available at [this link](https://github.com/search?q=org%3Atriton-inference-server&type=Code). + +If your error message only occurs in one or a few places in the Triton code, you may be able to see what’s going wrong pretty quickly. Even if not, it’s good to save this link to provide to us when asking for help with your issue. This is often the first thing we look for. + +**Step 3. Build with Debug Flags** + +The next step is building with debug flags. We unfortunately don’t provide a debug container, so you’d need to follow the [build guide](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/build.md) to build the container, which includes a [section on adding debug symbols](https://github.com/triton-inference-server/server/blob/main/docs/build.md#building-with-debug-symbols). Once you do so, you can install GDB (`apt-get install gdb`) in the container and run Triton in GDB (`gdb --args tritonserver…`). If needed, you can open a second terminal to run a script in another container. If the server segfaults, you can enter `backtrace`, which will provide you a call stack that lets you know where the error got generated. You should then be able to trace the source of the error. If the bug still exists after debugging, we’ll need this to expedite our work. + +Advanced GDB users can also examine variable values, add breakpoints, and more to find the cause of their issue. + +### Specific Issues +**Undefined Symbols** + +There are a few options here: +- This often means a version mismatch between the version of a framework used by Triton and the one used to create the model. 
Check the version of the framework used in the Triton container and compare against the version used to generate the model. +- If you are loading a shared library used by a backend, don’t forget to include LD_PRELOAD before the command to run Tritonserver.  + - `LD_PRELOAD= tritonserver --model-repository…` +If you built the backend yourself, this could be a linking error. If you are confident the backends and server were built correctly, double check that the server is loading the correct backend. + +## Server Issues + +You generally should not run into errors with the server itself. If the server goes down, it’s usually because something went wrong during model loading or inference and you can use the above section to debug. It’s particularly useful to work through the [Building with Debug Flags](https://github.com/triton-inference-server/server/blob/main/docs/build.md#building-with-debug-symbols) section above to resolve those sorts of issues. However, this section will go through some specific cases that may occur. + +### No Connection to Server + +If you are having trouble connecting to the server or getting its health via the health endpoint (`curl -v localhost:8000/v2/health/ready`), make sure you are able to reach the network your server is running on from where you are running your command. Most commonly, we see that when separate Docker containers are started for the client and server, they are not started with [--net=host](https://docs.docker.com/network/host/) to share the network. + +### Intermittent Failure + +This is going to be one of the hardest things to debug. If possible, you want to build your server with debug flags to get a backtrace of what is happening specifically. You would also want to keep notes to see how often this happens and whether that is a common cause. The server itself should not fail while idling, so see if a certain action (loading/unloading a model, running a model inference, etc.) is triggering it. + +### Server Failure Due to Individual Models + +If you want the server to start up even when models fail, use the `exit-on-error=false` option. If you want the server health endpoint to show ready even when specific models fail, use the `--strict-readiness=false` flag. + +### Deadlock + +Some useful steps for debugging a deadlock with `gdb`: +1. Use `$info threads` to see which threads are waiting. +2. Go to a thread: `$thread 4`. +3. Print the backtrace: `$bt`. +4. Go to the frame with the lock: `$f 1`. +5. Print the memory of the mutex being held: `$p *mutex`. +6. You can now see the owner of the mutex under `owner`. + +## Client Issues + +For working with different client cases, the best resources are the [client repo’s](https://github.com/triton-inference-server/client) examples. You can see clients written in Python, Java, and C++ with running examples across many common use cases. You can review the main functions of these clients to get a sense of the flow of the code. + +We often get performance optimization questions around the clients. Triton clients send input tensors as raw binary. However, GRPC uses protobuf which has some serialization and deserialization overhead. For those looking for the lowest-latency solution, C API eliminates the latency associated with GRPC/HTTP. Shared memory is also a good option to reduce data movement when the client and server are on the same system. + +## Performance Issues + +This section goes over debugging unexpected performance. 
If you are looking to optimize performance, please see the [Optimization](https://github.com/triton-inference-server/server/blob/main/docs/optimization.md) and [Performance Tuning](https://github.com/triton-inference-server/server/blob/main/docs/performance_tuning.md) guides.

The easiest step to start with is running perf_analyzer to get a breakdown of the request lifecycle, throughput, and latency for each individual model. For a more detailed view, you can [enable tracing](https://github.com/triton-inference-server/server/blob/main/docs/trace.md) when running the server. This will provide exact timestamps to drill down into what is happening. You can also enable tracing with perf_analyzer for the GRPC and HTTP clients by using the tracing flags. Note that enabling tracing can impact Triton’s performance, but it can be helpful to examine the timestamps throughout a request’s lifecycle.

### Performance Profiling

The next step would be to use a performance profiler. One profiler we recommend is [Nsight Systems](https://developer.nvidia.com/nsight-systems) (nsys), optionally including NVIDIA Tools Extension (NVTX) markers to profile Triton.

The Triton server container already has nsys installed. However, Triton does not build with the NVTX markers by default. If you want to use NVTX markers, you should build Triton with build.py, using the `--enable-nvtx` flag. This will provide details around some phases of processing a request, such as queueing, running inference, and handling outputs.

You can profile Triton by running `nsys profile tritonserver --model-repository …`. The [nsys documentation](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) provides more options and details for getting a thorough overview of what is going on.

## Submitting an Issue

If you’ve done the initial debugging steps with no results, the next step is to submit the issue to us. Before you do so, please answer these questions:
- Is this reproducible with multiple models and/or our example models? Or is the issue unique to your model?
- Is the bug reproducible with any protocol (ex: HTTP vs GRPC)? Or only one protocol?

The answers to the above should inform what you submit. If you find that this issue only happens under specific circumstances, please include this in your report. If the issue still exists, please submit **all** of the below:

- The commands or script used to build/pull Triton and run your models.
  - If building Triton, please provide the version or branch you are building from.
- Your model configuration file.
- The error received, plus any logs.
  - If your issue involves the server crashing, a backtrace of the dump would be helpful.
  - Please enable verbose logging (`--log-verbose=1`) to get the most detailed logs.
- If this issue is unique to your model, your model or a toy model that reproduces the issue.
- Anything else that would expedite our investigation.

diff --git a/docs/user_guide/decoupled_models.md b/docs/user_guide/decoupled_models.md
new file mode 100644
index 0000000000..fbe6f4c298
--- /dev/null
+++ b/docs/user_guide/decoupled_models.md
@@ -0,0 +1,127 @@

# Decoupled Backends and Models

Triton can support [backends](https://github.com/triton-inference-server/backend) and models that send multiple responses for a request or zero responses for a request. A decoupled model/backend may also send responses out-of-order relative to the order that the request batches are executed.
This allows the backend to deliver responses whenever it deems fit. This is specifically useful in Automated Speech Recognition (ASR). Requests with a large number of responses will not block responses to other requests from being delivered.

## Developing Decoupled Backend/Model

### C++ Backend

Read carefully about the [Triton Backend API](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api), [Inference Requests and Responses](https://github.com/triton-inference-server/backend/blob/main/README.md#inference-requests-and-responses) and [Decoupled Responses](https://github.com/triton-inference-server/backend/blob/main/README.md#decoupled-responses). The [repeat backend](https://github.com/triton-inference-server/repeat_backend) and [square backend](https://github.com/triton-inference-server/square_backend) demonstrate how the Triton Backend API can be used to implement a decoupled backend. These examples are designed to show the flexibility of the Triton API and in no way should be used in production. They may process multiple batches of requests at the same time without having to increase the [instance count](model_configuration.md#instance-groups). In a real deployment, the backend should not allow the caller thread to return from TRITONBACKEND_ModelInstanceExecute until that instance is ready to handle another set of requests. If not designed properly, the backend can easily be over-subscribed. This can also cause under-utilization of features like [Dynamic Batching](model_configuration.md#dynamic-batcher) as it leads to eager batching.

### Python model using Python Backend

Read carefully about the [Python Backend](https://github.com/triton-inference-server/python_backend), and specifically [`execute`](https://github.com/triton-inference-server/python_backend#execute).

The [decoupled examples](https://github.com/triton-inference-server/python_backend/tree/main/examples/decoupled) demonstrate how the decoupled API can be used to implement a decoupled Python model. As noted in the examples, these are designed to show the flexibility of the decoupled API and in no way should be used in production.

## Deploying Decoupled Models

The [decoupled model transaction policy](model_configuration.md#decoupled) must be set in the provided [model configuration](model_configuration.md) file for the model. Triton requires this information to enable the special handling required for decoupled models. Deploying decoupled models without this configuration setting will throw errors at runtime.

## Running Inference on Decoupled Models

[Inference Protocols and APIs](../customization_guide/inference_protocols.md) describes various ways a client can communicate and run inference on the server. For decoupled models, Triton's HTTP endpoint cannot be used for running inference as it supports exactly one response per request. Even the standard ModelInfer RPC in the GRPC endpoint does not support decoupled responses. In order to run inference on a decoupled model, the client must use the bi-directional streaming RPC. See [here](https://github.com/triton-inference-server/common/blob/main/protobuf/grpc_service.proto) for more details. The [decoupled_test.py](../../qa/L0_decoupled/decoupled_test.py) demonstrates how gRPC streaming can be used to infer decoupled models.
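As a rough sketch, assuming the Python GRPC client and a hypothetical decoupled model with made-up tensor names, bi-directional streaming inference looks something like this:

```python
import queue
import numpy as np
import tritonclient.grpc as grpcclient

responses = queue.Queue()

def callback(result, error):
    # Called once per response; a decoupled model may send zero, one, or
    # many responses for a single request.
    responses.put(error if error is not None else result)

client = grpcclient.InferenceServerClient(url="localhost:8001")
client.start_stream(callback=callback)

data = np.array([[4]], dtype=np.int32)
inp = grpcclient.InferInput("IN", list(data.shape), "INT32")
inp.set_data_from_numpy(data)
client.async_stream_infer(model_name="my_decoupled_model", inputs=[inp])

# Close the stream when done; a real client would track completion using
# the final-response flag/parameter described below.
client.stop_stream()
while not responses.empty():
    print(responses.get())
```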
+ +If using [Triton's in-process C API](../customization_guide/inference_protocols.md#in-process-triton-server-api), +your application should be cognizant that the callback function you registered with +`TRITONSERVER_InferenceRequestSetResponseCallback` can be invoked any number of times, +each time with a new response. You can take a look at [grpc_server.cc](https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc) + +### Knowing When a Decoupled Inference Request is Complete + +An inference request is considered complete when a response containing the +`TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag is received from a model/backend. + +1. Client applications using streaming GRPC can access this information by + checking the response parameters for the `"triton_final_response"` parameter. + Decoupled models may not send a response for each request depending on how + the model/backend is designed. In these cases where no response is sent by + the backend, the streaming GRPC client can opt-in to receive an empty final + response for each request. By default, empty final responses are not sent to + save on network traffic. + + ```python + # Example of streaming GRPC client opting-in + client.async_stream_infer( + ..., + enable_empty_final_response=True + ) + ``` + +2. Client applications using the C API can check the + `TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag directly in their response + handling / callback logic. + +The [decoupled_test.py](../../qa/L0_decoupled/decoupled_test.py) +demonstrates an example of opting-in through the streaming GRPC +Python client API and programmatically identifying when a final response +is received through the `"triton_final_response"` response parameter. + diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md new file mode 100644 index 0000000000..2381b1d9b9 --- /dev/null +++ b/docs/user_guide/faq.md @@ -0,0 +1,205 @@ + + +# FAQ + +## What are the advantages of running a model with Triton Inference Server compared to running directly using the model's framework API? + +When using Triton Inference Server the inference result will be the +same as when using the model's framework directly. However, with +Triton you get benefits like [concurrent model +execution](architecture.md#concurrent-model-execution) (the ability to +run multiple models at the same time on the same GPU) and [dynamic +batching](model_configuration.md#dynamic-batcher) to get better +throughput. You can also [replace or upgrade models while Triton and +client application are running](model_management.md). Another benefit +is that Triton can be deployed as a Docker container, anywhere – on +premises and on public clouds. Triton Inference Server also [supports +multiple +frameworks](https://github.com/triton-inference-server/backend) such +as TensorRT, TensorFlow, PyTorch, and ONNX on both GPUs and CPUs +leading to a streamlined deployment. + +## Can Triton Inference Server run on systems that don't have GPUs? + +Yes, the QuickStart guide describes how to [run Triton on a CPU-Only +System](../getting_started/quickstart.md#run-on-cpu-only-system). + +## Can Triton Inference Server be used in non-Docker environments? + +Yes. Triton Inference Server can also be [built from +source](../customization_guide/build.md#building-without-docker) on your "bare metal" +system. + +## Do you provide client libraries for languages other than C++ and Python? 
+ +We provide C++ and Python client libraries to make it easy for users +to write client applications that communicate with Triton. We chose +those languages because they were likely to be popular and performant +in the ML inference space, but in the future we can possibly add other +languages if there is a need. + +We provide the GRPC API as a way to generate your own client library +for a large number of languages. By following the official GRPC +documentation and using +[grpc_service.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/grpc_service.proto) +you can generate language bindings for all the languages supported by +GRPC. We provide three examples of this for +[Go](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/go), +[Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples/grpc_client.py) and +[Java](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/java). + +In general the client libraries (and client examples) are meant to be +just that, examples. We feel the client libraries are well written and +well tested, but they are not meant to serve every possible use +case. In some cases you may want to develop your own customized +library to suit your specific needs. + +## How would you use Triton Inference Server within the AWS environment? + +In an AWS environment, the Triton Inference Server docker container +can run on [CPU-only instances or GPU compute +instances](../getting_started/quickstart.md#launch-triton). Triton can run directly on the +compute instance or inside Elastic Kubernetes Service (EKS). In +addition, other AWS services such as Elastic Load Balancer (ELB) can +be used for load balancing traffic among multiple Triton +instances. Elastic Block Store (EBS) or S3 can be used for storing +deep-learning models loaded by the inference server. + +## How do I measure the performance of my model running in the Triton Inference Server? + +The Triton Inference Server exposes performance information in two +ways: by [Prometheus metrics](metrics.md) and by the statistics +available through the [HTTP/REST, GRPC, and C +APIs](../customization_guide/inference_protocols.md). + +A client application, +[perf_analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md), +allows you to measure the performance of an individual model using a synthetic +load. The perf_analyzer application is designed to show you the tradeoff of +latency vs. throughput. + +## How can I fully utilize the GPU with Triton Inference Server? + +Triton Inference Server has several features designed to increase +GPU utilization: + +* Triton can [simultaneously perform inference for multiple + models](architecture.md#concurrent-model-execution) (using either + the same or different frameworks) using the same GPU. + +* Triton can increase inference throughput by using [multiple +instances of the same +model](architecture.md#concurrent-model-execution) to handle multiple +simultaneous inferences requests to that model. Triton chooses +reasonable defaults but [you can also control the exact level of +concurrency](model_configuration.md#instance-groups) on a +model-by-model basis. + +* Triton can [batch together multiple inference requests into a single + inference execution](model_configuration.md#dynamic-batcher). Typically, + batching inference requests leads to much higher thoughput with only + a relatively small increase in latency. 
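+
+As a concrete (purely illustrative) sketch, enabling the dynamic batcher and a
+second model instance is typically a small addition to a model's
+`config.pbtxt`; the values below are placeholders, not tuned recommendations.
+
+```
+  dynamic_batching {
+    max_queue_delay_microseconds: 100
+  }
+  instance_group [
+    {
+      count: 2
+      kind: KIND_GPU
+    }
+  ]
+```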
+ +As a general rule, batching is the most beneficial way to increase GPU +utilization. So you should always try enabling the [dynamic +batcher](model_configuration.md#dynamic-batcher) with your models. Using +multiple instances of a model can also provide some benefit but is +typically most useful for models that have small compute +requirements. Most models will benefit from using two instances but +more than that is often not useful. + +## If I have a server with multiple GPUs should I use one Triton Inference Server to manage all GPUs or should I use multiple inference servers, one for each GPU? + +Triton Inference Server will take advantage of all GPUs that it has +access to on the server. You can limit the GPUs available to Triton by +using the CUDA_VISIBLE_DEVICES environment variable (or with Docker +you can also use NVIDIA_VISIBLE_DEVICES or --gpus flag when launching +the container). When using multiple GPUs, Triton will distribute +inference request across the GPUs to keep them all equally +utilized. You can also [control more explicitly which models are +running on which GPUs](model_configuration.md#instance-groups). + +In some deployment and orchestration environments (for example, +Kubernetes) it may be more desirable to partition a single multi-GPU +server into multiple *nodes*, each with one GPU. In this case the +orchestration environment will run a different Triton for each GPU and +an load balancer will be used to divide inference requests across the +available Triton instances. + +## If the server segfaults, how can I debug it? + +The NGC build is a Release build and does not contain Debug symbols. +The build.py as well defaults to a Release build. Refer to the instructions +in [build.md](../customization_guide/build.md#building-with-debug-symbols) to create a Debug build +of Triton. This will help find the cause of the segmentation fault when +looking at the gdb trace for the segfault. + +When opening a GitHub issue for the segfault with Triton, please include +the backtrace to better help us resolve the problem. + +## What are the benefits of using [Triton Inference Server](https://developer.nvidia.com/triton-inference-server) as part of the [NVIDIA AI Enterprise Software Suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/)? + +NVIDIA AI Enterprise enables enterprises to implement full AI workflows by +delivering an entire end-to-end AI platform. Four key benefits: + +### Enterprise-Grade Support, Security & API Stability: + +Business-critical AI projects stay on track with NVIDIA Enterprise Support, +available globally to assist both IT teams with deploying and managing the +lifecycle of AI applications and the developer teams with building AI +applications. Support includes maintenance updates, dependable SLAs and +response times. Regular security reviews and priority notifications mitigate +potential risk of unmanaged opensource and ensure compliance with corporate +standards. Finally, long term support and regression testing ensures API +stability between releases. + +### Speed time to production with AI Workflows & Pretrained Models: +To reduce the complexity of developing common AI applications, NVIDIA AI +Enterprise includes +[AI workflows](https://www.nvidia.com/en-us/launchpad/ai/workflows/) which are +reference applications for specific business outcomes such as Intelligent +Virtual Assistants and Digital Fingerprinting for real-time cybersecurity threat +detection. 
AI workflow reference applications may include +[AI frameworks](https://docs.nvidia.com/deeplearning/frameworks/index.html) and +[pretrained models](https://developer.nvidia.com/ai-models), +[Helm Charts](https://catalog.ngc.nvidia.com/helm-charts), +[Jupyter Notebooks](https://developer.nvidia.com/run-jupyter-notebooks) and +[documentation](https://docs.nvidia.com/ai-enterprise/index.html#overview). + +### Performance for Efficiency and Cost Savings: +Using accelerated compute for AI workloads such as data process with +[NVIDIA RAPIDS Accelerator](https://developer.nvidia.com/rapids) for Apache +Spark and inference with Triton Inference Sever delivers better performance +which also improves efficiency and reduces operation and infrastructure costs, +including savings from reduced time and energy consumption. + +### Optimized and Certified to Deploy Everywhere: +Cloud, Data Center, Edge Optimized and certified to ensure reliable performance +whether it’s running your AI in the public cloud, virtualized data centers, or +on DGX systems. diff --git a/docs/user_guide/images/arch.jpg b/docs/user_guide/images/arch.jpg new file mode 100644 index 0000000000..733b9f169c Binary files /dev/null and b/docs/user_guide/images/arch.jpg differ diff --git a/docs/user_guide/images/dyna_sequence_example0.png b/docs/user_guide/images/dyna_sequence_example0.png new file mode 100644 index 0000000000..0872d4a25b Binary files /dev/null and b/docs/user_guide/images/dyna_sequence_example0.png differ diff --git a/docs/user_guide/images/dyna_sequence_example1.png b/docs/user_guide/images/dyna_sequence_example1.png new file mode 100644 index 0000000000..b20bcea5ed Binary files /dev/null and b/docs/user_guide/images/dyna_sequence_example1.png differ diff --git a/docs/user_guide/images/ensemble_example0.png b/docs/user_guide/images/ensemble_example0.png new file mode 100644 index 0000000000..7ff1f2fdfd Binary files /dev/null and b/docs/user_guide/images/ensemble_example0.png differ diff --git a/docs/user_guide/images/multi_model_exec.png b/docs/user_guide/images/multi_model_exec.png new file mode 100644 index 0000000000..77413112e6 Binary files /dev/null and b/docs/user_guide/images/multi_model_exec.png differ diff --git a/docs/user_guide/images/multi_model_parallel_exec.png b/docs/user_guide/images/multi_model_parallel_exec.png new file mode 100644 index 0000000000..ba690e808a Binary files /dev/null and b/docs/user_guide/images/multi_model_parallel_exec.png differ diff --git a/docs/user_guide/images/multi_model_serial_exec.png b/docs/user_guide/images/multi_model_serial_exec.png new file mode 100644 index 0000000000..fd5f92b04b Binary files /dev/null and b/docs/user_guide/images/multi_model_serial_exec.png differ diff --git a/docs/user_guide/images/sequence_example0.png b/docs/user_guide/images/sequence_example0.png new file mode 100644 index 0000000000..d46bac0987 Binary files /dev/null and b/docs/user_guide/images/sequence_example0.png differ diff --git a/docs/user_guide/images/sequence_example1.png b/docs/user_guide/images/sequence_example1.png new file mode 100644 index 0000000000..1c6b18b57e Binary files /dev/null and b/docs/user_guide/images/sequence_example1.png differ diff --git a/docs/user_guide/images/sequence_example2.png b/docs/user_guide/images/sequence_example2.png new file mode 100644 index 0000000000..b55611e3d9 Binary files /dev/null and b/docs/user_guide/images/sequence_example2.png differ diff --git a/docs/user_guide/images/triton_on_jetson.png b/docs/user_guide/images/triton_on_jetson.png new file 
mode 100644 index 0000000000..c54dd279b9 Binary files /dev/null and b/docs/user_guide/images/triton_on_jetson.png differ diff --git a/docs/user_guide/jetson.md b/docs/user_guide/jetson.md new file mode 100644 index 0000000000..e2b2b0ad34 --- /dev/null +++ b/docs/user_guide/jetson.md @@ -0,0 +1,215 @@ + + +# Triton Inference Server Support for Jetson and JetPack + +A release of Triton for [JetPack 5.0](https://developer.nvidia.com/embedded/jetpack) +is provided in the attached tar file in the [release notes](https://github.com/triton-inference-server/server/releases). + +![Triton on Jetson Diagram](images/triton_on_jetson.png) + +Triton Inference Server support on JetPack includes: + +* Running models on GPU and NVDLA +* [Concurrent model execution](architecture.md#concurrent-model-execution) +* [Dynamic batching](architecture.md#models-and-schedulers) +* [Model pipelines](architecture.md#ensemble-models) +* [Extensible backends](https://github.com/triton-inference-server/backend) +* [HTTP/REST and GRPC inference protocols](../customization_guide/inference_protocols.md) +* [C API](../customization_guide/inference_protocols.md#in-process-triton-server-api) + +Limitations on JetPack 5.0: + +* Onnx Runtime backend does not support the OpenVino and TensorRT execution providers. +The CUDA execution provider is in Beta. +* The Python backend does not support GPU Tensors and Async BLS. +* CUDA IPC (shared memory) is not supported. System shared memory however is supported. +* GPU metrics, GCS storage, S3 storage and Azure storage are not supported. + +On JetPack, although HTTP/REST and GRPC inference protocols are supported, for edge +use cases, direct [C API integration](../customization_guide/inference_protocols.md#in-process-triton-server-api) +is recommended. + +You can download the `.tgz` file for Jetson from the Triton Inference Server +[release page](https://github.com/triton-inference-server/server/releases) in the +_"Jetson JetPack Support"_ section. + +The `.tgz` file contains the Triton server executable and shared libraries, +as well as the C++ and Python client libraries and examples. 
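+
+For example, the archive might be unpacked as follows; the exact file name
+depends on the release and is shown here only as a placeholder:
+
+```
+mkdir -p /opt/tritonserver
+tar -xzf tritonserver<version>-jetpack5.0.tgz -C /opt/tritonserver
+```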
+ +## Installation and Usage + +### Build Dependencies for Triton + +The following dependencies must be installed before building Triton server: + +``` +apt-get update && \ + apt-get install -y --no-install-recommends \ + software-properties-common \ + autoconf \ + automake \ + build-essential \ + git \ + libb64-dev \ + libre2-dev \ + libssl-dev \ + libtool \ + libboost-dev \ + rapidjson-dev \ + patchelf \ + pkg-config \ + libopenblas-dev \ + libarchive-dev \ + zlib1g-dev \ + python3 \ + python3-dev \ + python3-pip +``` + +Additional Onnx Runtime dependencies must be installed to build the Onnx Runtime backend: + +``` +pip3 install --upgrade flake8 flatbuffers +``` + +Additional PyTorch dependencies must be installed to build (and run) the PyTorch backend: + +``` +apt-get -y install autoconf \ + bc \ + g++-8 \ + gcc-8 \ + clang-8 \ + lld-8 + +pip3 install --upgrade expecttest xmlrunner hypothesis aiohttp pyyaml scipy ninja typing_extensions protobuf +``` + +Apart from these PyTorch dependencies, the PyTorch wheel corresponding to the release must also be installed (for build and runtime): + +``` +pip3 install --upgrade https://developer.download.nvidia.com/compute/redist/jp/v50/pytorch/torch-1.12.0a0+2c916ef.nv22.3-cp38-cp38-linux_aarch64.whl +``` + +The following dependencies must be installed before building Triton client libraries/examples: + +``` +apt-get install -y --no-install-recommends \ + curl \ + jq + +pip3 install --upgrade wheel setuptools cython && \ + pip3 install --upgrade grpcio-tools "numpy<2" attrdict pillow +``` + +**Note**: OpenCV 4.2.0 is installed as a part of JetPack. It is one of the dependencies for the client build. + +**Note**: When building Triton on Jetson, you will require a recent version of cmake. +We recommend using cmake 3.25.2. Below is a script to upgrade your cmake version to 3.25.2. + +``` +apt remove cmake +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +apt update && apt install -y gpg wget && \ + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | \ + gpg --dearmor - | \ + tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ + . /etc/os-release && \ + echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ + tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ + apt-get update && \ + apt-get install -y --no-install-recommends cmake cmake-data +``` + +### Runtime Dependencies for Triton + +The following runtime dependencies must be installed before running Triton server: + +``` +apt-get update && \ + apt-get install -y --no-install-recommends \ + libb64-0d \ + libre2-9 \ + libssl1.1 \ + rapidjson-dev \ + libopenblas-dev \ + libarchive-dev \ + zlib1g \ + python3 \ + python3-dev \ + python3-pip +``` + +The following runtime dependencies must be installed before running Triton client: + +``` +apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + jq + +pip3 install --upgrade wheel setuptools && \ + pip3 install --upgrade grpcio-tools "numpy<2" attrdict pillow +``` + +The PyTorch runtime dependencies are the same as the build dependencies listed above. + +### Usage + +**Note**: The PyTorch backend depends on libomp.so, which is not loaded automatically. +If using the PyTorch backend in Triton, you need to set the LD_LIBRARY_PATH to allow +libomp.so to be loaded as needed before launching Triton. 
+ +``` +LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/lib/llvm-8/lib" +``` + +**Note**: On Jetson, the backend directory must be explicitly specified using the +`--backend-directory` flag. Starting from 23.04, Triton no longer supports +TensorFlow 1.x. If you'd like to use TensorFlow 1.x with Triton prior to 23.04, +a version string is required to use TensorFlow 1.x. + +``` +tritonserver --model-repository=/path/to/model_repo --backend-directory=/path/to/tritonserver/backends \ + --backend-config=tensorflow,version=2 +``` + +**Note**: +[perf_analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) +is supported on Jetson, while the [model_analyzer](model_analyzer.md) is +currently not available for Jetson. To execute `perf_analyzer` for C API, use +the CLI flag `--service-kind=triton_c_api`: + +```shell +perf_analyzer -m graphdef_int32_int32_int32 --service-kind=triton_c_api \ + --triton-server-directory=/opt/tritonserver \ + --model-repository=/workspace/qa/L0_perf_analyzer_capi/models +``` + +Refer to these [examples](../examples/jetson/README.md) that demonstrate how to use Triton Inference Server on Jetson. diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md new file mode 100644 index 0000000000..b8fc0d8ee0 --- /dev/null +++ b/docs/user_guide/metrics.md @@ -0,0 +1,386 @@ + + +# Metrics + +Triton provides [Prometheus](https://prometheus.io/) metrics +indicating GPU and request statistics. By default, these metrics are +available at http://localhost:8002/metrics. The metrics are only +available by accessing the endpoint, and are not pushed or published +to any remote server. The metric format is plain text so you can view +them directly, for example: + +``` +$ curl localhost:8002/metrics +``` + +The `tritonserver --allow-metrics=false` option can be used to disable +all metric reporting, while the `--allow-gpu-metrics=false` and +`--allow-cpu-metrics=false` can be used to disable just the GPU and CPU +metrics respectively. + +The `--metrics-port` option can be used to select a different port. By default, +Triton reuses the `--http-address` option for the metrics endpoint and binds the +http and metrics endpoints to the same specific address when http service is +enabled. If http service is not enabled, the metric address will bind to `0.0.0.0` +by default. To uniquely specify the metric endpoint, `--metrics-address` option +can be used. See the `tritonserver --help` output for more info on these CLI options. + +To change the interval at which metrics are polled/updated, see the `--metrics-interval-ms` flag. Metrics that are updated "Per Request" are unaffected by this interval setting. This interval only applies to metrics that are designated as "Per Interval" in the tables of each section below: + +- [Inference Request Metrics](#inference-request-metrics) +- [GPU Metrics](#gpu-metrics) +- [CPU Metrics](#cpu-metrics) +- [Pinned Memory Metrics](#pinned-memory-metrics) +- [Response Cache Metrics](#response-cache-metrics) +- [Custom Metrics](#custom-metrics) + +## Inference Request Metrics + +### Counts + +For models that do not support batching, *Request Count*, *Inference +Count* and *Execution Count* will be equal, indicating that each +inference request is executed separately. + +For models that support batching, the count metrics can be interpreted +to determine average batch size as *Inference Count* / *Execution +Count*. The count metrics are illustrated by the following examples: + +* Client sends a single batch-1 inference request. 
*Request Count* = + 1, *Inference Count* = 1, *Execution Count* = 1. + +* Client sends a single batch-8 inference request. *Request Count* = + 1, *Inference Count* = 8, *Execution Count* = 1. + +* Client sends 2 requests: batch-1 and batch-8. Dynamic batcher is not + enabled for the model. *Request Count* = 2, *Inference Count* = 9, + *Execution Count* = 2. + +* Client sends 2 requests: batch-1 and batch-1. Dynamic batcher is + enabled for the model and the 2 requests are dynamically batched by + the server. *Request Count* = 2, *Inference Count* = 2, *Execution + Count* = 1. + +* Client sends 2 requests: batch-1 and batch-8. Dynamic batcher is + enabled for the model and the 2 requests are dynamically batched by + the server. *Request Count* = 2, *Inference Count* = 9, *Execution + Count* = 1. + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|--------------|----------------|------------|---------------------------|-----------|-------------| +|Count |Success Count |`nv_inference_request_success` |Number of successful inference requests received by Triton (each request is counted as 1, even if the request contains a batch) |Per model |Per request | +| |Failure Count |`nv_inference_request_failure` |Number of failed inference requests received by Triton (each request is counted as 1, even if the request contains a batch) |Per model |Per request | +| |Inference Count |`nv_inference_count` |Number of inferences performed (a batch of "n" is counted as "n" inferences, does not include cached requests)|Per model|Per request| +| |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request| +| |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request| + +#### Failure Count Categories + +| Failed Request Reason |Description | +|------------|------------| +| REJECTED | Number of inference failures due to request timeout in the scheduler. | +| CANCELED | Number of inference failures due to request cancellation in the core. | +| BACKEND | Number of inference failures during execution of requests in the backend/model. | +| OTHER | Number of inference failures due to other uncategorized reasons in the core. | + +> **Note** +> +> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason. +> +> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`. +> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`. +> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client. +> The `reason=OTHER` reflects that fact that the ensemble doesn't currently capture the specific reason why +> ModelA's request failed at this time. 
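+>
+> As a purely hypothetical scrape of the metrics endpoint, that situation
+> might surface as the following (label names and ordering are illustrative):
+>
+> ```
+> nv_inference_request_failure{model="ModelA",reason="REJECTED",version="1"} 1
+> nv_inference_request_failure{model="EnsembleA",reason="OTHER",version="1"} 2
+> ```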
+ +#### Pending Request Count (Queue Size) Per-Model + +The *Pending Request Count* reflects the number of requests that have been +received by Triton core via `TRITONSERVER_InferAsync`, but have not yet +started execution by a backend model instance +(`TRITONBACKEND_ModelInstanceExecute`). + +For all intents and purposes, the +"pending request count" and "queue size" per-model can be used +interchangeably, and the number reflected in the metric should +intuitively represent the number of requests that are not currently +being executed by any model instances. In simple terms, if you send a 100 +requests to a model that can only handle 5 requests concurrently, then you +should see a pending count of 95 for that model in most cases. + +For those interested in more technical details, the term "pending request count" +is a bit more accurate than "queue size" because Triton is highly configurable, +and there are many places in Triton that a request be considered pending rather +than a single queue. Some of the most common will be called out below: +- Default Scheduler backlogs any requests not currently executing. + - Assuming 1 available model instance with the default scheduler settings, + and 10 requests are sent in rapid succession. + - The 1st request should be picked up for + execution immediately, and the remaining 9 requests should be considered + pending for this model, until the 1st request is finished. Afterwards, the + next request should be picked up and the pending count should be decremented + to 8, and so on until all requests are finished and the pending count is 0. +- Dynamic Batcher queue for dynamically creating batches from requests. + - Assuming 1 available model instance with the dynamic batch scheduler + configured with `max_batch_size: 4` and a sufficiently large + `max_queue_delay_microseconds` (or queue of requests), + and 10 requests are sent in rapid succession. + - The first 4 requests, or as large of a batch the scheduler could form, + should be picked up for execution immediately, and the remaining 6 requests + should be considered pending. After the batch finishes, the next batch + should be picked up, decrementing the pending count again to 2 pending. + Then finally since only 2 requests remain, the final 2 requests will be + batched and picked up by the backend, decrementing the pending count to 0. +- Sequence Batcher queues and backlogs for ongoing sequence requests, some may + be assigned sequence slots, some may not. + - Sequence Batchers of both strategies (direct and oldest) will have pending + counts that generally follow the same trend as the dynamic batching + description above. The sequence batchers will immediately execute as many + requests in a batch as it can based on the model/scheduler config settings, + and any further requests will be considered pending until the previous batch + finishes and the next batch can start. +- Rate Limiter queues for prepared batches of requests. + - When rate limiting is enabled, requests can be held back from execution + to satisfy the rate limit constraints that were configured. + +There are some places where a request would not be considered pending: +- Ensemble Scheduler + - The Ensemble Scheduler almost immediately enqueues any requests it receives + into the composing model schedulers at the first step in the ensemble. + Therefore, the requests could be considered pending by the composing model + scheduler's, however from the ensemble's perspective, these requests have been + scheduled. 
+- Frontends (HTTP/GRPC Servers) + - Any requests sent from a client to a frontend server in-front of Triton + may spend some time in the corresponding server's code mapping + protocol-specific metadata to Triton metadata. Though this time is + generally brief, it will not be considered pending from Triton's + perspective until Triton core has received the request from the frontend. + +### Latencies + +Starting in 23.04, Triton exposes the ability to choose the types of metrics +that are published through the `--metrics-config` CLI options. + +#### Counters + +By default, the following +[Counter](https://prometheus.io/docs/concepts/metric_types/#counter) +metrics are used for latencies: + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|--------------|----------------|------------|---------------------------|-----------|-------------| +|Latency |Request Time |`nv_inference_request_duration_us` |Cumulative end-to-end inference request handling time (includes cached requests) |Per model |Per request | +| |Queue Time |`nv_inference_queue_duration_us` |Cumulative time requests spend waiting in the scheduling queue (includes cached requests) |Per model |Per request | +| |Compute Input Time|`nv_inference_compute_input_duration_us` |Cumulative time requests spend processing inference inputs (in the framework backend, does not include cached requests) |Per model |Per request | +| |Compute Time |`nv_inference_compute_infer_duration_us` |Cumulative time requests spend executing the inference model (in the framework backend, does not include cached requests) |Per model |Per request | +| |Compute Output Time|`nv_inference_compute_output_duration_us` |Cumulative time requests spend processing inference outputs (in the framework backend, does not include cached requests) |Per model |Per request | + +To disable these metrics specifically, you can set `--metrics-config counter_latencies=false` + +#### Summaries + +> **Note** +> +> The following Summary feature is experimental for the time being and may be +> subject to change based on user feedback. + +To get configurable quantiles over a sliding time window, Triton supports +a set a [Summary](https://prometheus.io/docs/concepts/metric_types/#summary) +metrics for latencies as well. These metrics are disabled by default, but can +be enabled by setting `--metrics-config summary_latencies=true`. + +For more information on how the quantiles are calculated, see +[this explanation](https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/). 
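+
+For example, a server could be launched with latency summaries enabled in
+addition to the default counters (the model repository path is a placeholder):
+
+```
+tritonserver --model-repository=/models --metrics-config summary_latencies=true
+```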
+ +The following summary metrics are available: + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|--------------|----------------|------------|---------------------------|-----------|-------------| +|Latency |Request Time |`nv_inference_request_summary_us` |Summary of end-to-end inference request handling times (includes cached requests) |Per model |Per request | +| |Queue Time |`nv_inference_queue_summary_us` |Summary of time requests spend waiting in the scheduling queue (includes cached requests) |Per model |Per request | +| |Compute Input Time|`nv_inference_compute_input_summary_us` |Summary time requests spend processing inference inputs (in the framework backend, does not include cached requests) |Per model |Per request | +| |Compute Time |`nv_inference_compute_infer_summary_us` |Summary of time requests spend executing the inference model (in the framework backend, does not include cached requests) |Per model |Per request | +| |Compute Output Time|`nv_inference_compute_output_summary_us` |Summary of time requests spend processing inference outputs (in the framework backend, does not include cached requests) |Per model |Per request | + +Each summary above is actually composed of several sub-metrics. For each +metric, there is a set of `quantile` metrics tracking the latency for each +quantile. Additionally, there are `_count` and `_sum` metrics that aggregate +the count and observed values for each. For example, see the following +information exposed by the Inference Queue Summary metrics: +``` +# HELP nv_inference_queue_summary_us Summary of inference queuing duration in microseconds (includes cached requests) +# TYPE nv_inference_queue_summary_us summary +nv_inference_queue_summary_us_count{model="my_model",version="1"} 161 +nv_inference_queue_summary_us_sum{model="my_model",version="1"} 11110 +nv_inference_queue_summary_us{model="my_model",version="1",quantile="0.5"} 55 +nv_inference_queue_summary_us{model="my_model",version="1",quantile="0.9"} 97 +nv_inference_queue_summary_us{model="my_model",version="1",quantile="0.95"} 98 +nv_inference_queue_summary_us{model="my_model",version="1",quantile="0.99"} 101 +nv_inference_queue_summary_us{model="my_model",version="1",quantile="0.999"} 101 +``` + +The count and sum for the summary above show that stats have been recorded for +161 requests, and took a combined total of 11110 microseconds. The `_count` and +`_sum` of a summary should generally match the counter metric equivalents when +applicable, such as: +``` +nv_inference_request_success{model="my_model",version="1"} 161 +nv_inference_queue_duration_us{model="my_model",version="1"} 11110 +``` + +Triton has a set of default quantiles to track, as shown above. To set +custom quantiles, you can use the `--metrics-config` CLI option. The format is: +``` +tritonserver --metrics-config summary_quantiles=":,...,:"` +``` + +For example: +``` +tritonserver --metrics-config summary_quantiles="0.5:0.05,0.9:0.01,0.95:0.001,0.99:0.001"` +``` + +To better understand the setting of error values for computing each quantile, see the +[best practices for histograms and summaries](https://prometheus.io/docs/practices/histograms/#histograms-and-summaries). + + +## GPU Metrics + +GPU metrics are collected through the use of [DCGM](https://developer.nvidia.com/dcgm). +Collection of GPU metrics can be toggled with the `--allow-gpu-metrics` CLI flag. 
+If building Triton locally, the `TRITON_ENABLE_METRICS_GPU` CMake build flag can be used to toggle building the relevant code entirely. + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|----------------|------------------|----------------------------|-------------------------------------------------------|-----------|-------------| +|GPU Utilization |Power Usage |`nv_gpu_power_usage` |GPU instantaneous power, in watts |Per GPU |Per interval | +| |Power Limit |`nv_gpu_power_limit` |Maximum GPU power limit, in watts |Per GPU |Per interval | +| |Energy Consumption|`nv_energy_consumption` |GPU energy consumption since Triton started, in joules |Per GPU |Per interval | +| |GPU Utilization |`nv_gpu_utilization` |GPU utilization rate (0.0 - 1.0) |Per GPU |Per interval | +|GPU Memory |GPU Total Memory |`nv_gpu_memory_total_bytes` |Total GPU memory, in bytes |Per GPU |Per interval | +| |GPU Used Memory |`nv_gpu_memory_used_bytes` |Used GPU memory, in bytes |Per GPU |Per interval | + + +## CPU Metrics + +Collection of CPU metrics can be toggled with the `--allow-cpu-metrics` CLI flag. +If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can be used to toggle building the relevant code entirely. + +> **Note** +> +> CPU Metrics are currently only supported on Linux. +> They collect information from the [/proc filesystem](https://www.kernel.org/doc/html/latest/filesystems/proc.html) such as `/proc/stat` and `/proc/meminfo`. + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|--------------|----------------|------------|---------------------------|-----------|-------------| +|CPU Utilization | CPU Utilization | `nv_cpu_utilization` | Total CPU utilization rate [0.0 - 1.0] | Aggregated across all cores since last interval | Per interval | +|CPU Memory | CPU Total Memory | `nv_cpu_memory_total_bytes` | Total CPU memory (RAM), in bytes | System-wide | Per interval | +| | CPU Used Memory | `nv_cpu_memory_used_bytes` | Used CPU memory (RAM), in bytes | System-wide | Per interval | + +## Pinned Memory Metrics + +Starting in 24.01, Triton offers Pinned Memory metrics to monitor the utilization of the Pinned Memory pool. + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|----------------|------------------|----------------------------|-------------------------------------------------------|-----------|-------------| +|Pinned Memory |Total Pinned memory |`nv_pinned_memory_pool_total_bytes` |Total Pinned memory, in bytes |All models |Per interval | +| |Used Pinned memory |`nv_pinned_memory_pool_used_bytes` |Used Pinned memory, in bytes |All models |Per interval | + +## Response Cache Metrics + +Cache metrics can be reported in two ways: + +1. A base set of cache metrics will be reported +by Triton directly, such as the cache hit/miss counts and durations described +below. + +2. As of 23.03, additional cache metrics may be reported depending on the +[cache implementation](response_cache.md#cache-implementations) +being used through Triton's [Metrics API](#custom-metrics). + +### Triton-reported Response Cache Metrics + +Compute latency metrics in the +[Inference Request Metrics table](#inference-request-metrics) above are +calculated for the time spent in model inference backends. If the response +cache is enabled for a given model (see [Response Cache](response_cache.md) +docs for more info), total inference times may be affected by response cache +lookup times. 
+ +On cache hits, "Cache Hit Time" indicates the time spent looking up the +response, and "Compute Input Time" / "Compute Time" / "Compute Output Time" +are not recorded. + +On cache misses, "Cache Miss Time" indicates the time spent looking up +the request hash and inserting the computed output tensor data into the cache. +Otherwise, "Compute Input Time" / "Compute Time" / "Compute Output Time" will +be recorded as usual. + +|Category |Metric |Metric Name |Description |Granularity|Frequency | +|--------------|----------------|------------|---------------------------|-----------|-------------| +|Count |Cache Hit Count |`nv_cache_num_hits_per_model` |Number of response cache hits per model |Per model |Per request | +| |Cache Miss Count |`nv_cache_num_misses_per_model` |Number of response cache misses per model |Per model |Per request | +|Latency |Cache Hit Time |`nv_cache_hit_duration_per_model` |Cumulative time requests spend retrieving a cached response per model on cache hits (microseconds) |Per model |Per request | +| |Cache Miss Time |`nv_cache_miss_duration_per_model` |Cumulative time requests spend looking up and inserting responses into the cache on a cache miss (microseconds) |Per model |Per request | + +Similar to the Summaries section above for Inference Request Metrics, the +per-model cache hit/miss latency metrics also support Summaries. + +> **Note** +> +> For models with response caching enabled, the inference request **summary** metric +> is currently disabled. This is due to extra time spent internally on cache +> management that wouldn't be reflected correctly in the end to end request time. +> Other summary metrics are unaffected. + +## Custom Metrics + +Triton exposes a C API to allow users and backends to register and collect +custom metrics with the existing Triton metrics endpoint. The user takes the +ownership of the custom metrics created through the APIs and must manage their +lifetime following the API documentation. + +The +[identity_backend](https://github.com/triton-inference-server/identity_backend/blob/main/README.md#custom-metric-example) +demonstrates a practical example of adding a custom metric to a backend. + +Further documentation can be found in the `TRITONSERVER_MetricFamily*` and +`TRITONSERVER_Metric*` API annotations in +[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). + +### TensorRT-LLM Backend Metrics + +The TRT-LLM backend uses the custom metrics API to track and expose specific metrics about +LLMs, KV Cache, and Inflight Batching to Triton: +https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics + +### vLLM Backend Metrics + +The vLLM backend uses the custom metrics API to track and expose specific metrics about +LLMs to Triton: +https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#triton-metrics diff --git a/docs/user_guide/model_analyzer.md b/docs/user_guide/model_analyzer.md new file mode 100644 index 0000000000..c4b606364b --- /dev/null +++ b/docs/user_guide/model_analyzer.md @@ -0,0 +1,45 @@ + + +# Model Analyzer + +The Triton [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) + is a tool that uses +[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) +to send requests to your model while measuring GPU memory and compute +utilization. 
The Model Analyzer is specifically useful for characterizing the +GPU memory requirements for your model under different batching and model +instance configurations. Once you have this GPU memory usage information you can +more intelligently decide on how to combine multiple models on the same GPU +while remaining within the memory capacity of the GPU. + +For more detailed examples and explanations of using Model Analyzer, see: +- [Model Analyzer Conceptual Guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_3-optimizing_triton_configuration) +- [Maximizing Deep Learning +Inference Performance with NVIDIA Model +Analyzer](https://developer.nvidia.com/blog/maximizing-deep-learning-inference-performance-with-nvidia-model-analyzer) \ No newline at end of file diff --git a/docs/user_guide/model_configuration.md b/docs/user_guide/model_configuration.md new file mode 100644 index 0000000000..1b0e64a533 --- /dev/null +++ b/docs/user_guide/model_configuration.md @@ -0,0 +1,1230 @@ + + +# Model Configuration + +**Is this your first time writing a config file?** Check out +[this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_1-model_deployment#model-configuration) + or this +[example](https://github.com/triton-inference-server/tutorials/tree/main/HuggingFace#examples)! + +Each model in a [model repository](model_repository.md) must include a +model configuration that provides required and optional information +about the model. Typically, this configuration is provided in a +config.pbtxt file specified as [ModelConfig +protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). +In some cases, discussed in [Auto-Generated Model +Configuration](#auto-generated-model-configuration), the model +configuration can be generated automatically by Triton and so does not +need to be provided explicitly. + +This section describes the most important model configuration +properties but the documentation in the [ModelConfig +protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +should also be consulted. + +## Minimal Model Configuration + +A minimal model configuration must specify the [*platform* and/or +*backend* +properties](https://github.com/triton-inference-server/backend/blob/main/README.md#backends), +the *max_batch_size* property, and the input and output tensors of the +model. + +As an example consider a TensorRT model that has two inputs, *input0* +and *input1*, and one output, *output0*, all of which are 16 entry +float32 tensors. The minimal configuration is: + +``` + platform: "tensorrt_plan" + max_batch_size: 8 + input [ + { + name: "input0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "input1" + data_type: TYPE_FP32 + dims: [ 16 ] + } + ] + output [ + { + name: "output0" + data_type: TYPE_FP32 + dims: [ 16 ] + } + ] +``` + +### Name, Platform and Backend + +The model configuration *name* property is optional. If the name of +the model is not specified in the configuration it is assumed to be +the same as the model repository directory containing the model. If +*name* is specified it must match the name of the model repository +directory containing the model. The required values for *platform* +and *backend* are described in the [backend +documentation](https://github.com/triton-inference-server/backend/blob/main/README.md#backends). 
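+
+As an illustration of the *backend* property, a hypothetical ONNX model could
+specify the backend directly instead of a platform; the tensor names, shapes
+and batch size below are placeholders only.
+
+```
+  backend: "onnxruntime"
+  max_batch_size: 8
+  input [
+    {
+      name: "input0"
+      data_type: TYPE_FP32
+      dims: [ 16 ]
+    }
+  ]
+  output [
+    {
+      name: "output0"
+      data_type: TYPE_FP32
+      dims: [ 16 ]
+    }
+  ]
+```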
+ +### Model Transaction Policy + +The *model_transaction_policy* property describes the nature of +transactions expected from the model. + +#### Decoupled + +This boolean setting indicates whether responses generated by +the model are [decoupled](./decoupled_models.md) +with the requests issued to it. Using decoupled means the number of +responses generated by the model may differ from number of requests +issued, and the responses may be out of order relative to the order +of requests. The default is false, which means the model will +generate exactly one response for each request. + +### Maximum Batch Size + +The *max_batch_size* property indicates the maximum batch size that +the model supports for the [types of +batching](architecture.md#models-and-schedulers) that can be exploited +by Triton. If the model's batch dimension is the first dimension, and +all inputs and outputs to the model have this batch dimension, then +Triton can use its [dynamic batcher](#dynamic-batcher) or [sequence +batcher](#sequence-batcher) to automatically use batching with the +model. In this case *max_batch_size* should be set to a value +greater-or-equal-to 1 that indicates the maximum batch size that +Triton should use with the model. + +For models that do not support batching, or do not support batching in +the specific ways described above, *max_batch_size* must be set to +zero. + + +### Inputs and Outputs + +Each model input and output must specify a name, datatype, and shape. +The name specified for an input or output tensor must match the name +expected by the model. + +#### Special Conventions for PyTorch Backend + +**Naming Convention:** + +Due to the absence of sufficient metadata for inputs/outputs in TorchScript +model files, the "name" attribute of inputs/outputs in the configuration must +follow specific naming conventions. These are detailed below. + +1. [Only for Inputs] When the input is not a Dictionary of Tensors, the input +names in the configuration file should mirror the names of the input arguments to +the forward function in the model's definition. + +For example, if the forward function for the Torchscript model was defined as +`forward(self, input0, input1)`, the first and second inputs should be named +"input0" and "input1" respectively. + +2. `__`: Where \ can be any string and \ is an +integer index that refers to the position of the corresponding input/output. + +This means that if there are two inputs and two outputs, the first and second +inputs can be named "INPUT__0" and "INPUT__1" and the first and second outputs +can be named "OUTPUT__0" and "OUTPUT__1" respectively. + +3. If all inputs (or outputs) do not follow the same naming convention, then we +enforce strict ordering from the model configuration i.e. we assume the order of +inputs (or outputs) in the configuration is the true ordering of these inputs. + +***Dictionary of Tensors as Input:*** + +The PyTorch backend supports passing of inputs to the model in the form of a +Dictionary of Tensors. This is only supported when there is a *single* input to +the model of type Dictionary that contains a mapping of string to tensor. As an +example, if there is a model that expects the input of the form: + +``` +{'A': tensor1, 'B': tensor2} +``` + +The input names in the configuration in this case must not follow the above +naming conventions `__`. Instead, the names of the inputs in this +case must map to the string value 'key' for that specific tensor. 
For this case,
+the inputs would be "A" and "B", where input "A" refers to the value
+corresponding to tensor1 and input "B" refers to the value corresponding to
+tensor2.
+
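+As a sketch, the configuration for such a hypothetical model could declare the
+inputs directly by their dictionary keys (datatypes and shapes below are
+placeholders only):
+
+```
+  platform: "pytorch_libtorch"
+  max_batch_size: 8
+  input [
+    {
+      name: "A"
+      data_type: TYPE_FP32
+      dims: [ 16 ]
+    },
+    {
+      name: "B"
+      data_type: TYPE_FP32
+      dims: [ 16 ]
+    }
+  ]
+  output [
+    {
+      name: "OUTPUT__0"
+      data_type: TYPE_FP32
+      dims: [ 4 ]
+    }
+  ]
+```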
+ +The datatypes allowed for input and output tensors varies based on the +type of the model. Section [Datatypes](#datatypes) describes the +allowed datatypes and how they map to the datatypes of each model +type. + +An input shape indicates the shape of an input tensor expected by the +model and by Triton in inference requests. An output shape indicates +the shape of an output tensor produced by the model and returned by +Triton in response to an inference request. Both input and output +shape must have rank greater-or-equal-to 1, that is, the empty shape +**[ ]** is not allowed. + +Input and output shapes are specified by a combination of +*max_batch_size* and the dimensions specified by the input or output +*dims* property. For models with *max_batch_size* greater-than 0, the +full shape is formed as [ -1 ] + *dims*. For models with +*max_batch_size* equal to 0, the full shape is formed as *dims*. For +example, for the following configuration the shape of "input0" is [ +-1, 16 ] and the shape of "output0" is [ -1, 4 ]. + +``` + platform: "tensorrt_plan" + max_batch_size: 8 + input [ + { + name: "input0" + data_type: TYPE_FP32 + dims: [ 16 ] + } + ] + output [ + { + name: "output0" + data_type: TYPE_FP32 + dims: [ 4 ] + } + ] +``` + +For a configuration that is identical except that *max_batch_size* +equal to 0, the shape of "input0" is [ 16 ] and the shape of "output0" +is [ 4 ]. + +``` + platform: "tensorrt_plan" + max_batch_size: 0 + input [ + { + name: "input0" + data_type: TYPE_FP32 + dims: [ 16 ] + } + ] + output [ + { + name: "output0" + data_type: TYPE_FP32 + dims: [ 4 ] + } + ] +``` + +For models that support input and output tensors with variable-size +dimensions, those dimensions can be listed as -1 in the input and +output configuration. For example, if a model requires a 2-dimensional +input tensor where the first dimension must be size 4 but the second +dimension can be any size, the model configuration for that input +would include *dims: [ 4, -1 ]*. Triton would then accept inference +requests where that input tensor's second dimension was any value +greater-or-equal-to 0. The model configuration can be more restrictive +than what is allowed by the underlying model. For example, even though +the framework model itself allows the second dimension to be any size, +the model configuration could be specified as *dims: [ 4, 4 ]*. In +this case, Triton would only accept inference requests where the input +tensor's shape was exactly *[ 4, 4 ]*. + +The [*reshape* property](#reshape) must be used if there is a mismatch +between the input shape that Triton receives in an inference request +and the input shape expected by the model. Similarly, the *reshape* +property must be used if there is a mismatch between the output shape +produced by the model and the shape that Triton returns in a response +to an inference request. + +Model inputs can specify `allow_ragged_batch` to indicate that the +input is a [ragged input](ragged_batching.md#ragged-batching). The field is +used with [dynamic batcher](#dynamic-batcher) to allow batching without +enforcing the input to have the same shape in all requests. + +## Auto-Generated Model Configuration + +The model configuration file containing the required +settings must be available with each model to be deployed +on Triton. In some cases the required portions of the model +configuration can be generated automatically by Triton. 
The +required portion of the model configuration are the settings +shown in the [Minimal Model Configuration](#minimal-model-configuration). +By default, Triton will try to complete these sections. However, +by starting Triton with `--disable-auto-complete-config` option, +Triton can be configured to not auto-complete model configuration +on the backend side. However, even with this option Triton will +fill in missing [`instance_group`](#instance-groups) settings with +default values. + +Triton can derive all the required settings automatically for +most of the TensorRT, TensorFlow saved-model, ONNX models, and OpenVINO models. +For Python models, [`auto_complete_config`](https://github.com/triton-inference-server/python_backend/#auto_complete_config) +function can be implemented in Python backend to provide +[`max_batch_size`](#maximum-batch-size), [`input`](#inputs-and-outputs) +and [`output`](#inputs-and-outputs) properties using `set_max_batch_size`, +`add_input`, and `add_output` functions. These properties will allow Triton +to load the Python model with [Minimal Model Configuration](#minimal-model-configuration) +in absence of a configuration file. +All other model types *must* provide a model configuration file. + +When developing a custom backend, you can populate required settings +in the configuration and call `TRITONBACKEND_ModelSetConfig` API to +update completed configuration with Triton core. You can take a +look at [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend) +and [Onnxruntime](https://github.com/triton-inference-server/onnxruntime_backend) +backends as examples of how to achieve this. Currently, only +[inputs, outputs](#inputs-and-outputs), [max_batch_size](#maximum-batch-size) +and [dynamic batching](#dynamic-batcher) settings can be populated by +backend. For custom backends, your config.pbtxt file must +include a `backend` field or your model name must be in the +form `.`. + +You can also see the model configuration generated for a model by +Triton using the [model configuration endpoint](../protocol/extension_model_configuration.md). The +easiest way to do this is to use a utility like *curl*: + +```bash +$ curl localhost:8000/v2/models//config +``` + +This will return a JSON representation of the generated model +configuration. From this you can take the max_batch_size, inputs, and +outputs sections of the JSON and convert it to a config.pbtxt file. +Triton only generates the [minimal portion of the model +configuration](#minimal-model-configuration). You must still provide +the optional portions of the model configuration by editing the +config.pbtxt file. + +## Custom Model Configuration + +Sometimes when multiple devices running Triton instances that share one +model repository, it is necessary to have models configured differently +on each platform in order to achieve the best performance. Triton allows +users to pick the custom model configuration name by setting `--model-config-name` option. + +For example, when running `./tritonserver --model-repository= --model-config-name=h100`, +the server will search the custom configuration file `h100.pbtxt` under +`/path/to/model/repository//configs` directory for each model +that is loaded. If `h100.pbtxt` exists, it will be used as the configuration +for this model. Otherwise, the default configuration `/path/to/model/repository//config.pbtxt` +or [auto-generated model configuration](#auto-generated-model-configuration) +will be selected based on the settings. 
+ +Custom model configuration also works with `Explicit` and `Poll` model +control modes. Users may delete or add new custom configurations and the +server will pick the configuration file for each loaded model dynamically. + +Note: custom model configuration name should not contain any space character. + +Example 1: --model-config-name=h100 +``` +. +└── model_repository/ + ├── model_a/ + │ ├── configs/ + │ │ ├── v100.pbtxt + │ │ └── **h100.pbtxt** + │ └── config.pbtxt + ├── model_b/ + │ ├── configs/ + │ │ └── v100.pbtxt + │ └── **config.pbtxt** + └── model_c/ + ├── configs/ + │ └── config.pbtxt + └── **config.pbtxt** +``` + +Example 2: --model-config-name=config +``` +. +└── model_repository/ + ├── model_a/ + │ ├── configs/ + │ │ ├── v100.pbtxt + │ │ └── h100.pbtxt + │ └── **config.pbtxt** + ├── model_b/ + │ ├── configs/ + │ │ └── v100.pbtxt + │ └── **config.pbtxt** + └── model_c/ + ├── configs/ + │ └── **config.pbtxt** + └── config.pbtxt +``` + +Example 3: --model-config-name not set +``` +. +└── model_repository/ + ├── model_a/ + │ ├── configs/ + │ │ ├── v100.pbtxt + │ │ └── h100.pbtxt + │ └── **config.pbtxt** + ├── model_b/ + │ ├── configs/ + │ │ └── v100.pbtxt + │ └── **config.pbtxt** + └── model_c/ + ├── configs/ + │ └── config.pbtxt + └── **config.pbtxt** +``` + +### Default Max Batch Size and Dynamic Batcher + +When a model is using the auto-complete feature, a default maximum +batch size may be set by using the `--backend-config=default-max-batch-size=` +command line argument. This allows all models which are capable of +batching and which make use of [Auto Generated Model Configuration](#auto-generated-model-configuration) +to have a default maximum batch size. This value is set to 4 by +default. Backend developers may make use of this default-max-batch-size +by obtaining it from the TRITONBACKEND_BackendConfig api. Currently, the +following backends which utilize these default batch values and turn on +dynamic batching in their generated model configurations are: + +1. [TensorFlow backend](https://github.com/triton-inference-server/tensorflow_backend) +2. [Onnxruntime backend](https://github.com/triton-inference-server/onnxruntime_backend) +3. [TensorRT backend](https://github.com/triton-inference-server/tensorrt_backend) + 1. TensorRT models store the maximum batch size explicitly and do not make use + of the default-max-batch-size parameter. However, if max_batch_size > 1 + and no [scheduler](model_configuration.md#scheduling-and-batching) + is provided, the dynamic batch scheduler will be enabled. + +If a value greater than 1 for the maximum batch size is set for the +model, the [dynamic_batching](#dynamic-batcher) config will be set +if no scheduler is provided in the configuration file. + + +## Datatypes + +The following table shows the tensor datatypes supported by +Triton. The first column shows the name of the datatype as it appears +in the model configuration file. The next four columns show the +corresponding datatype for supported model frameworks. If a model +framework does not have an entry for a given datatype, then Triton +does not support that datatype for that model. The sixth column, +labeled "API", shows the corresponding datatype for the TRITONSERVER C +API, TRITONBACKEND C API, HTTP/REST protocol and GRPC protocol. The +last column shows the corresponding datatype for the Python numpy +library. 
+ +|Model Config |TensorRT |TensorFlow |ONNX Runtime |PyTorch |API |NumPy | +|--------------|--------------|--------------|--------------|---------|---------|--------------| +|TYPE_BOOL | kBOOL |DT_BOOL |BOOL |kBool |BOOL |bool | +|TYPE_UINT8 | kUINT8 |DT_UINT8 |UINT8 |kByte |UINT8 |uint8 | +|TYPE_UINT16 | |DT_UINT16 |UINT16 | |UINT16 |uint16 | +|TYPE_UINT32 | |DT_UINT32 |UINT32 | |UINT32 |uint32 | +|TYPE_UINT64 | |DT_UINT64 |UINT64 | |UINT64 |uint64 | +|TYPE_INT8 | kINT8 |DT_INT8 |INT8 |kChar |INT8 |int8 | +|TYPE_INT16 | |DT_INT16 |INT16 |kShort |INT16 |int16 | +|TYPE_INT32 | kINT32 |DT_INT32 |INT32 |kInt |INT32 |int32 | +|TYPE_INT64 | kINT64 |DT_INT64 |INT64 |kLong |INT64 |int64 | +|TYPE_FP16 | kHALF |DT_HALF |FLOAT16 | |FP16 |float16 | +|TYPE_FP32 | kFLOAT |DT_FLOAT |FLOAT |kFloat |FP32 |float32 | +|TYPE_FP64 | |DT_DOUBLE |DOUBLE |kDouble |FP64 |float64 | +|TYPE_STRING | |DT_STRING |STRING | |BYTES |dtype(object) | +|TYPE_BF16 | kBF16 | | | |BF16 | | + +For TensorRT each value is in the nvinfer1::DataType namespace. For +example, nvinfer1::DataType::kFLOAT is the 32-bit floating-point +datatype. + +For TensorFlow each value is in the tensorflow namespace. For example, +tensorflow::DT_FLOAT is the 32-bit floating-point value. + +For ONNX Runtime each value is prepended with ONNX_TENSOR_ELEMENT_DATA_TYPE_. +For example, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT is the 32-bit floating-point +datatype. + +For PyTorch each value is in the torch namespace. For example, torch::kFloat +is the 32-bit floating-point datatype. + +For Numpy each value is in the numpy module. For example, numpy.float32 +is the 32-bit floating-point datatype. + +## Reshape + +The *ModelTensorReshape* property on a model configuration input or +output is used to indicate that the input or output shape accepted by +the inference API differs from the input or output shape expected or +produced by the underlying framework model or custom backend. + +For an input, *reshape* can be used to reshape the input tensor to a +different shape expected by the framework or backend. A common +use-case is where a model that supports batching expects a batched +input to have shape *[ batch-size ]*, which means that the batch +dimension fully describes the shape. For the inference API the +equivalent shape *[ batch-size, 1 ]* must be specified since each +input must specify a non-empty *dims*. For this case the input should +be specified as: + +``` + input [ + { + name: "in" + dims: [ 1 ] + reshape: { shape: [ ] } + } +``` + +For an output, *reshape* can be used to reshape the output tensor +produced by the framework or backend to a different shape that is +returned by the inference API. A common use-case is where a model that +supports batching expects a batched output to have shape *[ batch-size +]*, which means that the batch dimension fully describes the +shape. For the inference API the equivalent shape *[ batch-size, 1 ]* +must be specified since each output must specify a non-empty +*dims*. For this case the output should be specified as: + +``` + output [ + { + name: "in" + dims: [ 1 ] + reshape: { shape: [ ] } + } +``` + +## Shape Tensors + +For models that support shape tensors, the *is_shape_tensor* property +must be set appropriately for inputs and outputs that are acting as +shape tensors. The following shows an example configuration that +specifies shape tensors. 
+ +``` + name: "myshapetensormodel" + platform: "tensorrt_plan" + max_batch_size: 8 + input [ + { + name: "input0" + data_type: TYPE_FP32 + dims: [ 1 , 3] + }, + { + name: "input1" + data_type: TYPE_INT32 + dims: [ 2 ] + is_shape_tensor: true + } + ] + output [ + { + name: "output0" + data_type: TYPE_FP32 + dims: [ 1 , 3] + } + ] +``` + +As discussed above, Triton assumes that batching occurs along the +first dimension which is not listed in in the input or output tensor +*dims*. However, for shape tensors, batching occurs at the first shape +value. For the above example, an inference request must provide inputs +with the following shapes. + +``` + "input0": [ x, 1, 3] + "input1": [ 3 ] + "output0": [ x, 1, 3] +``` + +Where *x* is the batch size of the request. Triton requires the shape +tensors to be marked as shape tensors in the model when using +batching. Note that "input1" has shape *[ 3 ]* and not *[ 2 ]*, which +is how it is described in model configuration. As `myshapetensormodel` +model is a batching model, the batch size should be provided as an +additional value. Triton will accumulate all the shape values together +for "input1" in batch dimension before issuing the request to model. + +For example, assume the client sends following three requests to Triton +with following inputs: + +``` +Request1: +input0: [[[1,2,3]]] <== shape of this tensor [1,1,3] +input1: [1,4,6] <== shape of this tensor [3] + +Request2: +input0: [[[4,5,6]], [[7,8,9]]] <== shape of this tensor [2,1,3] +input1: [2,4,6] <== shape of this tensor [3] + +Request3: +input0: [[[10,11,12]]] <== shape of this tensor [1,1,3] +input1: [1,4,6] <== shape of this tensor [3] +``` + +Assuming these requests get batched together would be delivered to the +model as: + + +``` +Batched Requests to model: +input0: [[[1,2,3]], [[4,5,6]], [[7,8,9]], [[10,11,12]]] <== shape of this tensor [4,1,3] +input1: [4, 4, 6] <== shape of this tensor [3] + +``` + +Currently, only TensorRT supports shape tensors. Read [Shape Tensor I/O](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#shape_tensor_io) +to learn more about shape tensors. + +## Non-Linear I/O Formats + +For models that process input or output data in non-linear formats, the _is_non_linear_format_io_ property +must be set. The following example model configuration shows how to specify that INPUT0 and INPUT1 use non-linear I/O data formats. + +``` + name: "mytensorrtmodel" + platform: "tensorrt_plan" + max_batch_size: 8 + input [ + { + name: "INPUT0" + data_type: TYPE_FP16 + dims: [ 3,224,224 ] + is_non_linear_format_io: true + }, + { + name: "INPUT1" + data_type: TYPE_FP16 + dims: [ 3,224,224 ] + is_non_linear_format_io: true + } + ] + output [ + { + name: "OUTPUT0" + data_type: TYPE_FP16 + dims: [ 1,3 ] + } + ] +``` + +Currently, only TensorRT supports this property. To learn more about I/O formats, refer to the [I/O Formats documentation](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#reformat-free-network-tensors). + +## Version Policy + +Each model can have one or more +[versions](model_repository.md#model-versions). The +*ModelVersionPolicy* property of the model configuration is used to +set one of the following policies. + +* *All*: All versions of the model that are available in the model + repository are available for inferencing. + ```version_policy: { all: {}}``` + +* *Latest*: Only the latest ‘n’ versions of the model in the + repository are available for inferencing. 
The latest versions of the + model are the numerically greatest version numbers. + ```version_policy: { latest: { num_versions: 2}}``` + +* *Specific*: Only the specifically listed versions of the model are + available for inferencing. + ```version_policy: { specific: { versions: [1,3]}}``` + +If no version policy is specified, then *Latest* (with n=1) is used as +the default, indicating that only the most recent version of the model +is made available by Triton. In all cases, the [addition or removal of +version subdirectories](model_management.md) from the model repository +can change which model version is used on subsequent inference +requests. + +The following configuration specifies that all versions of the model +will be available from the server. + +``` + platform: "tensorrt_plan" + max_batch_size: 8 + input [ + { + name: "input0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "input1" + data_type: TYPE_FP32 + dims: [ 16 ] + } + ] + output [ + { + name: "output0" + data_type: TYPE_FP32 + dims: [ 16 ] + } + ] + version_policy: { all { }} +``` + +## Instance Groups + +Triton can provide multiple [instances of a +model](architecture.md#concurrent-model-execution) so that multiple +inference requests for that model can be handled simultaneously. The +model configuration *ModelInstanceGroup* property is used to specify +the number of execution instances that should be made available and +what compute resource should be used for those instances. + +### Multiple Model Instances + +By default, a single execution instance of the model is created for +each GPU available in the system. The instance-group setting can be +used to place multiple execution instances of a model on every GPU or +on only certain GPUs. For example, the following configuration will +place two execution instances of the model to be available on each +system GPU. + +``` + instance_group [ + { + count: 2 + kind: KIND_GPU + } + ] +``` + +And the following configuration will place one execution instance on +GPU 0 and two execution instances on GPUs 1 and 2. + +``` + instance_group [ + { + count: 1 + kind: KIND_GPU + gpus: [ 0 ] + }, + { + count: 2 + kind: KIND_GPU + gpus: [ 1, 2 ] + } + ] +``` +For a more detailed example of using instance groups, see + [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#concurrent-model-execution). +### CPU Model Instance + +The instance group setting is also used to enable execution of a model +on the CPU. A model can be executed on the CPU even if there is a GPU +available in the system. The following places two execution instances +on the CPU. + +``` + instance_group [ + { + count: 2 + kind: KIND_CPU + } + ] +``` + +If no `count` is specified for a KIND_CPU instance group, then the default instance +count will be 2 for selected backends (Tensorflow and Onnxruntime). All +other backends will default to 1. + +### Host Policy + +The instance group setting is associated with a host policy. The following +configuration will associate all instances created by the instance group setting +with host policy "policy_0". By default the host policy will be set according to +the device kind of the instance, for instance, KIND_CPU is "cpu", KIND_MODEL is +"model", and KIND_GPU is "gpu_\". 
+ +``` + instance_group [ + { + count: 2 + kind: KIND_CPU + host_policy: "policy_0" + } + ] +``` + +### Rate Limiter Configuration + +Instance group optionally specifies [rate limiter](rate_limiter.md) +configuration which controls how the rate limiter operates on the +instances in the group. The rate limiter configuration is ignored if +rate limiting is off. If rate limiting is on and if an instance_group +does not provide this configuration, then the execution on the model +instances belonging to this group will not be limited in any way by +the rate limiter. The configuration includes the following +specifications: + +#### Resources + +The set of [resources](rate_limiter.md#resources) required to execute +a model instance. The "name" field identifies the resource and "count" +field refers to the number of copies of the resource that the model +instance in the group requires to run. The "global" field specifies +whether the resource is per-device or shared globally across the system. +Loaded models can not specify a resource with the same name both as global +and non-global. If no resources are provided then triton assumes the +execution of model instance does not require any resources and will +start executing as soon as model instance is available. + +#### Priority + +Priority serves as a weighting value to be used for prioritizing across +all the instances of all the models. An instance with priority 2 will be +given 1/2 the number of scheduling chances as an instance with priority +1. + +The following example specifies the instances in the group requires +four "R1" and two "R2" resources for execution. Resource "R2" is a global +resource. Additionally, the rate-limiter priority of the instance_group +is 2. + +``` + instance_group [ + { + count: 1 + kind: KIND_GPU + gpus: [ 0, 1, 2 ] + rate_limiter { + resources [ + { + name: "R1" + count: 4 + }, + { + name: "R2" + global: True + count: 2 + } + ] + priority: 2 + } + } + ] +``` + +The above configuration creates 3 model instances, one on each device +(0, 1 and 2). The three instances will not contend for "R1" among +themselves as "R1" is local for their own device, however, they will +contend for "R2" because it is specified as a global resource which +means "R2" is shared across the system. Though these instances don't +contend for "R1" among themselves, but they will contend for "R1" +with other model instances which includes "R1" in their resource +requirements and run on the same device as them. + +### Ensemble Model Instance Groups + +[Ensemble models](architecture.md#ensemble-models) +are an abstraction Triton uses to execute a user-defined pipeline of models. +Since there is no physical instance associated with an ensemble model, the +`instance_group` field can not be specified for it. + +However, each composing model that makes up an ensemble can specify +`instance_group` in its config file and individually support parallel +execution as described above when the ensemble receives multiple requests. + +## CUDA Compute Capability + +Similar to the `default_model_filename` field, you can optionally specify the +`cc_model_filenames` field to map the GPU's +[CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus) +to a corresponding model filename at model load time. This is particularly +useful for TensorRT models, since they are generally tied to a specific +compute capability. 
+ +``` +cc_model_filenames [ + { + key: "7.5" + value: "resnet50_T4.plan" + }, + { + key: "8.0" + value: "resnet50_A100.plan" + } +] +``` + +## Scheduling And Batching + +Triton supports batch inferencing by allowing individual inference +requests to specify a batch of inputs. The inferencing for a batch of +inputs is performed at the same time which is especially important for +GPUs since it can greatly increase inferencing throughput. In many use +cases the individual inference requests are not batched, therefore, +they do not benefit from the throughput benefits of batching. + +The inference server contains multiple scheduling and batching +algorithms that support many different model types and use-cases. More +information about model types and schedulers can be found in [Models +And Schedulers](architecture.md#models-and-schedulers). + +### Default Scheduler + +The default scheduler is used for a model if none of the +*scheduling_choice* properties are specified in the model +configuration. The default scheduler simply distributes inference +requests to all [model instances](#instance-groups) configured for the +model. + +### Dynamic Batcher + +Dynamic batching is a feature of Triton that allows inference requests +to be combined by the server, so that a batch is created +dynamically. Creating a batch of requests typically results in +increased throughput. The dynamic batcher should be used for +[stateless models](architecture.md#stateless-models). The dynamically created +batches are distributed to all [model instances](#instance-groups) +configured for the model. + +Dynamic batching is enabled and configured independently for each +model using the *ModelDynamicBatching* property in the model +configuration. These settings control the preferred size(s) of the +dynamically created batches, the maximum time that requests can be +delayed in the scheduler to allow other requests to join the dynamic +batch, and queue properties such a queue size, priorities, and +time-outs. Refer to +[this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#what-is-dynamic-batching) +for a more detailed example of dynamic batching. + +#### Recommended Configuration Process + +The individual settings are described in detail below. The following +steps are the recommended process for tuning the dynamic batcher for +each model. It is also possible to use the [Model +Analyzer](model_analyzer.md) to automatically search across different +dynamic batcher configurations. + +* Decide on a [maximum batch size](#maximum-batch-size) for the model. + +* Add the following to the model configuration to enable the dynamic + batcher with all default settings. By default the dynamic batcher + will create batches as large as possible up to the maximum batch + size and will not [delay](#delayed-batching) when forming batches. + +``` + dynamic_batching { } +``` + +* Use the + [Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) + to determine the latency and throughput provided by the default dynamic + batcher configuration. + +* If the default configuration results in latency values that are + within your latency budget, try one or both of the following to + trade off increased latency for increased throughput: + + * Increase maximum batch size. + + * Set [batch delay](#delayed-batching) to a non-zero value. 
Try + increasing delay values until the latency budget is exceeded to + see the impact on throughput. + +* [Preferred batch sizes](#preferred-batch-sizes) should not be used + for most models. A preferred batch size(s) should only be configured + if that batch size results in significantly higher performance than + other batch sizes. + +#### Preferred Batch Sizes + +The *preferred_batch_size* property indicates the batch sizes that the +dynamic batcher should attempt to create. For most models, +*preferred_batch_size* should not be specified, as described in +[Recommended Configuration +Process](#recommended-configuration-process). An exception is TensorRT +models that specify multiple optimization profiles for different batch +sizes. In this case, because some optimization profiles may give +significant performance improvement compared to others, it may make +sense to use *preferred_batch_size* for the batch sizes supported by +those higher-performance optimization profiles. + +The following example shows the configuration that enables dynamic +batching with preferred batch sizes of 4 and 8. + +``` + dynamic_batching { + preferred_batch_size: [ 4, 8 ] + } +``` + +When a model instance becomes available for inferencing, the dynamic +batcher will attempt to create batches from the requests that are +available in the scheduler. Requests are added to the batch in the +order the requests were received. If the dynamic batcher can form a +batch of a preferred size(s) it will create a batch of the largest +possible preferred size and send it for inferencing. If the dynamic +batcher cannot form a batch of a preferred size (or if the dynamic +batcher is not configured with any preferred batch sizes), it will +send a batch of the largest size possible that is less than the +maximum batch size allowed by the model (but see the following section +for the delay option that changes this behavior). + +The size of generated batches can be examined in aggregate using +[count metrics](metrics.md#inference-request-metrics). + +#### Delayed Batching + +The dynamic batcher can be configured to allow requests to be delayed +for a limited time in the scheduler to allow other requests to join +the dynamic batch. For example, the following configuration sets the +maximum delay time of 100 microseconds for a request. + +``` + dynamic_batching { + max_queue_delay_microseconds: 100 + } +``` + +The *max_queue_delay_microseconds* property setting changes the +dynamic batcher behavior when a maximum size (or preferred size) batch +cannot be created. When a batch of a maximum or preferred size cannot +be created from the available requests, the dynamic batcher will delay +sending the batch as long as no request is delayed longer than the +configured *max_queue_delay_microseconds* value. If a new request +arrives during this delay and allows the dynamic batcher to form a +batch of a maximum or preferred batch size, then that batch is sent +immediately for inferencing. If the delay expires the dynamic batcher +sends the batch as is, even though it is not a maximum or preferred +size. + +#### Preserve Ordering + +The *preserve_ordering* property is used to force all responses to be +returned in the same order as requests were received. See the +[protobuf +documentation](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +for details. + +#### Priority Levels + +By default the dynamic batcher maintains a single queue that holds all +inference requests for a model. 
The requests are processed and batched +in order. The *priority_levels* property can be used to create +multiple priority levels within the dynamic batcher so that requests +with higher priority are allowed to bypass requests with lower +priority. Requests at the same priority level are processed in +order. Inference requests that do not set a priority are scheduled +using the *default_priority_level* property. + +#### Queue Policy + +The dynamic batcher provides several settings that control how +requests are queued for batching. + +When *priority_levels* is not defined, the *ModelQueuePolicy* for the +single queue can be set with *default_queue_policy*. When +*priority_levels* is defined, each priority level can have a different +*ModelQueuePolicy* as specified by *default_queue_policy* and *priority_queue_policy*. + +The *ModelQueuePolicy* property allows a maximum queue size to be set +using the *max_queue_size*. The *timeout_action*, +*default_timeout_microseconds* and *allow_timeout_override* settings +allow the queue to be configured so that individual requests are +rejected or deferred if their time in the queue exceeds a specified +timeout. + +#### Custom Batching + +You can set custom batching rules that work _in addition to_ the specified behavior of the dynamic batcher. +To do so, you would implement five functions in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) +and create a shared library. These functions are described below. + +| Function | Description| +| :-- | :-- | +| TRITONBACKEND_ModelBatchIncludeRequest | Determines whether a request should be included in the current batch | +| TRITONBACKEND_ModelBatchInitialize | Initializes a record-keeping data structure for a new batch | +| TRITONBACKEND_ModelBatchFinalize | Deallocates the record-keeping data structure after a batch is formed | +| TRITONBACKEND_ModelBatcherInitialize | Initializes a read-only data structure for use with all batches | +| TRITONBACKEND_ModelBatcherFinalize | Deallocates the read-only data structure after the model is unloaded | + +The path to the shared library can be passed into the model configuration via the parameter +`TRITON_BATCH_STRATEGY_PATH`. If not provided, the dynamic batcher will look for a custom +batching strategy named batchstrategy.so in the model version, model, and backend directories, +in that order. If found, it will load it. This lets you easily share a custom batching strategy +among all models using the same backend. + +For a tutorial of how to create and use a custom batching library, please see the +[backend examples directory](https://github.com/triton-inference-server/backend/tree/main/examples#volume-batching). + +### Sequence Batcher + +Like the dynamic batcher, the sequence batcher combines non-batched +inference requests, so that a batch is created dynamically. Unlike the +dynamic batcher, the sequence batcher should be used for +[stateful models](architecture.md#stateful-models) where a sequence of +inference requests must be routed to the same model instance. The +dynamically created batches are distributed to all [model +instances](#instance-groups) configured for the model. + +Sequence batching is enabled and configured independently for each +model using the *ModelSequenceBatching* property in the model +configuration. These settings control the sequence timeout as well as +configuring how Triton will send control signals to the model +indicating sequence start, end, ready and correlation ID. 
See +[Stateful Models](architecture.md#stateful-models) for more +information and examples. + +#### Iterative Sequences + +> [!NOTE] +> Iterative sequences are *provisional* and likely to change in future versions. + +The sequence batcher supports stateful execution of "iterative +sequences" where a single request is processed over a number of +scheduling iterations. "Iterative sequences" enable the scheduler to +batch multiple inflight requests at each step and allow the model or +backend to complete a request at any iteration. + +For models and backends that support "iterative sequences", users can +enable support in the sequence batcher by specifying: + +``` + sequence_batching { + iterative_sequence: true + } +``` + +An "iterative sequence" refers to stateful models that iteratively +process a single request until a complete response is generated. When +iterative sequence is enabled, the sequence scheduler will expect a +single incoming request to initiate the sequence. Backends that +support iterative sequences can then yield back to the sequence +batcher to reschedule the request for further execution in a future +batch. + +Because only one request is used to represent the "iterative +sequence", the user doesn't need to set [control +inputs](architecture.md#control-inputs) mentioned in the previous +section. They will be filled internally by the scheduler. + +"Iterative sequences" can be [decoupled](#decoupled) where more than +one response can be generated during execution or non-decoupled where +a single response is generated when the full response is complete. + +The main advantage of "iterative sequences" is the ability to use +Triton's native batching capabilities to form batches of requests at +different iteration stages without having to maintain additional state +in the backend. Typically batches executed by backends are completed +in the same execution which can waste resources if the execution of +one of the requests in the batch takes much longer than the rest. With +"iterative sequences", processing for each request in a batch can be +broken down into multiple iterations and a backend can start +processing new requests as soon as any request is complete. + +##### Continuous/Inflight Batching with Iterative Sequences + +Continuous batching, iteration level batching, and inflight batching +are terms used in large language model (LLM) inferencing to describe +batching strategies that form batches of requests at each iteration +step. By forming batches "continuously" inference servers can increase +throughput by reusing batch slots as soon as they are free without +waiting for all requests in a batch to complete. + +As the number of steps required to process a request can vary +significantly, batching existing requests and new requests continuously +can have a significant improvement on throughput and latency. + +To achieve inflight batching with iterative sequences, the backend +should break request processing into a number of steps, where each +step corresponds to one Triton model instance execution. At the end of +each step, the model instance will release requests that have been +completed and reschedule requests that are still inflight. Triton will +then form and schedule the next batch of requests that mixes new and +rescheduled requests. + +### Ensemble Scheduler + +The ensemble scheduler must be used for [ensemble + models](architecture.md#ensemble-models) and cannot be used for any + other type of model. 
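+
+As a rough sketch, an ensemble model declares its pipeline directly in its
+config.pbtxt with an `ensemble_scheduling` block that maps tensors between
+the composing models (all model and tensor names below are hypothetical):
+
+```
+  name: "my_ensemble"
+  platform: "ensemble"
+  max_batch_size: 8
+  input [
+    {
+      name: "RAW_IMAGE"
+      data_type: TYPE_UINT8
+      dims: [ -1 ]
+    }
+  ]
+  output [
+    {
+      name: "CLASSIFICATION"
+      data_type: TYPE_FP32
+      dims: [ 1000 ]
+    }
+  ]
+  ensemble_scheduling {
+    step [
+      {
+        # First step: the hypothetical "preprocess" model consumes the
+        # ensemble input and produces an internal tensor.
+        model_name: "preprocess"
+        model_version: -1
+        input_map { key: "INPUT" value: "RAW_IMAGE" }
+        output_map { key: "OUTPUT" value: "preprocessed_image" }
+      },
+      {
+        # Second step: the hypothetical "classifier" model consumes that
+        # internal tensor and produces the ensemble output.
+        model_name: "classifier"
+        model_version: -1
+        input_map { key: "INPUT0" value: "preprocessed_image" }
+        output_map { key: "OUTPUT0" value: "CLASSIFICATION" }
+      }
+    ]
+  }
+```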
+ +The ensemble scheduler is enabled and configured independently for +each model using the *ModelEnsembleScheduling* property in the model +configuration. The settings describe the models that are included in +the ensemble and the flow of tensor values between the models. See +[Ensemble Models](architecture.md#ensemble-models) for more +information and examples. + +## Optimization Policy + +The model configuration *ModelOptimizationPolicy* property is used to +specify optimization and prioritization settings for a model. These +settings control if/how a model is optimized by the backend and how it +is scheduled and executed by Triton. See the [ModelConfig +protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +and [optimization](optimization.md#framework-specific-optimization) +documentation for the currently available settings. + +## Model Warmup + +When a model is loaded by Triton the corresponding +[backend](https://github.com/triton-inference-server/backend/blob/main/README.md) +initializes for that model. For some backends, some or all of this +initialization is deferred until the model receives its first +inference request (or first few inference requests). As a result, the +first (few) inference requests can be significantly slower due to +deferred initialization. + +To avoid these initial, slow inference requests, Triton provides a +configuration option that enables a model to be "warmed up" so that it +is completely initialized before the first inference request is +received. When the *ModelWarmup* property is defined in a model +configuration, Triton will not show the model as being ready for +inference until model warmup has completed. + +The model configuration *ModelWarmup* is used to specify warmup +settings for a model. The settings define a series of inference +requests that Triton will create to warm-up each model instance. A +model instance will be served only if it completes the requests +successfully. Note that the effect of warming up models varies +depending on the framework backend, and it will cause Triton to be +less responsive to model update, so the users should experiment and +choose the configuration that suits their need. See the +[ModelWarmup protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +documentation for the currently available settings, and +[L0_warmup](https://github.com/triton-inference-server/server/blob/main/qa/L0_warmup/test.sh) +for examples on specifying different variants of warmup samples. + +## Response Cache + +The model configuration `response_cache` section has an `enable` boolean used to +enable the Response Cache for this model. + +``` +response_cache { + enable: true +} +``` + +In addition to enabling the cache in the model config, a `--cache-config` must +be specified when starting the server to enable caching on the server-side. See +the [Response Cache](response_cache.md) doc for more details on enabling +server-side caching. diff --git a/docs/user_guide/model_management.md b/docs/user_guide/model_management.md new file mode 100644 index 0000000000..4ce698feee --- /dev/null +++ b/docs/user_guide/model_management.md @@ -0,0 +1,250 @@ + + +# Model Management + +Triton provides model management APIs are part of the [HTTP/REST and +GRPC protocols, and as part of the C +API](../customization_guide/inference_protocols.md). Triton operates in one of three model +control modes: NONE, EXPLICIT or POLL. 
The model control mode +determines how changes to the model repository are handled by Triton +and which of these protocols and APIs are available. + +## Model Control Mode NONE + +Triton attempts to load all models in the model repository at +startup. Models that Triton is not able to load will be marked as +UNAVAILABLE and will not be available for inferencing. + +Changes to the model repository while the server is running will be +ignored. Model load and unload requests using the [model control +protocol](../protocol/extension_model_repository.md) will have no affect +and will return an error response. + +This model control mode is selected by specifying +`--model-control-mode=none` when starting Triton. This is the default +model control mode. Changing the model repository while Triton is +running must be done carefully, as explained in [Modifying the Model +Repository](#modifying-the-model-repository). + +## Model Control Mode EXPLICIT + +At startup, Triton loads only those models specified explicitly with the +`--load-model` command-line option. To load ALL models at startup, specify +`--load-model=*` as the ONLY `--load-model` argument. Specifying +`--load-model=*` in conjunction with another `--load-model` argument will +result in error. If `--load-model` is not specified then no models are loaded +at startup. Models that Triton is not able to load will be marked as +UNAVAILABLE and will not be available for inferencing. + +After startup, all model load and unload actions must be initiated +explicitly by using the [model control +protocol](../protocol/extension_model_repository.md). The response +status of the model control request indicates success or failure of +the load or unload action. When attempting to reload an already loaded +model, if the reload fails for any reason the already loaded model +will be unchanged and will remain loaded. If the reload succeeds, the +newly loaded model will replace the already loaded model without any +loss in availability for the model. + +This model control mode is enabled by specifying +`--model-control-mode=explicit`. Changing the model repository while +Triton is running must be done carefully, as explained in [Modifying +the Model Repository](#modifying-the-model-repository). + +If you are seeing some memory growth when using the [model control +protocol](../protocol/extension_model_repository.md) for loading and unloading +models, it is possible that it's not an actual memory leak but some system's +malloc heuristics that causes memory to be unable to be released back to the OS +right away. To improve memory performance, you can consider switching from +malloc to [tcmalloc](https://github.com/google/tcmalloc) or +[jemalloc](https://github.com/jemalloc/jemalloc) by setting the `LD_PRELOAD` +environment variable when running Triton, as shown below: +``` +# Using tcmalloc +LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libtcmalloc.so.4:${LD_PRELOAD} tritonserver --model-repository=/models ... +``` +``` +# Using jemalloc +LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so:${LD_PRELOAD} tritonserver --model-repository=/models ... +``` +We recommend experimenting with both tcmalloc and jemalloc to determine which +one works better for your use case, as they have different strategies for +memory allocation and deallocation and may perform differently depending on the +workload. + +Both tcmalloc and jemalloc libraries are already installed within the Triton +container. 
However, if you need to install them, you can do so using the +following commands: +``` +# Install tcmalloc +apt-get install gperf libgoogle-perftools-dev +``` +``` +# Install jemalloc +apt-get install libjemalloc-dev +``` + +## Model Control Mode POLL + +Triton attempts to load all models in the model repository at +startup. Models that Triton is not able to load will be marked as +UNAVAILABLE and will not be available for inferencing. + +Changes to the model repository will be detected and Triton will +attempt to load and unload models as necessary based on those changes. +When attempting to reload an already loaded model, if the reload fails +for any reason the already loaded model will be unchanged and will +remain loaded. If the reload succeeds, the newly loaded model will +replace the already loaded model without any loss of availability for +the model. + +Changes to the model repository may not be detected immediately +because Triton polls the repository periodically. You can control the +polling interval with the `--repository-poll-secs` option. The console +log or the [model ready +protocol](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md) +or the index operation of the [model control +protocol](../protocol/extension_model_repository.md) can be used to +determine when model repository changes have taken effect. + +**WARNING: There is no synchronization between when Triton polls the +model repository and when you make any changes to the repository. As a +result Triton could observe partial and incomplete changes that lead +to unexpected behavior. For this reason POLL mode is not recommended +for use in production environments.** + +Model load and unload requests using the [model control +protocol](../protocol/extension_model_repository.md) will have no affect +and will return an error response. + +This model control mode is enabled by specifying +`--model-control-mode=poll` and by setting `--repository-poll-secs` to a +non-zero value when starting Triton. Changing the model repository +while Triton is running must be done carefully, as explained in +[Modifying the Model Repository](#modifying-the-model-repository). + +In POLL mode Triton responds to the following model repository +changes: + +* Versions may be added and removed from models by adding and removing + the corresponding version subdirectory. Triton will allow in-flight + requests to complete even if they are using a removed version of the + model. New requests for a removed model version will fail. Depending + on the model's [version + policy](model_configuration.md#version-policy), changes to the + available versions may change which model version is served by + default. + +* Existing models can be removed from the repository by removing the + corresponding model directory. Triton will allow in-flight requests + to any version of the removed model to complete. New requests for a + removed model will fail. + +* New models can be added to the repository by adding a new model + directory. + +* The [model configuration file](model_configuration.md) + (config.pbtxt) can be changed and Triton will unload and reload the + model to pick up the new model configuration. + +* Label(s) files providing labels for outputs that represent + classifications can be added, removed, or modified and Triton will + unload and reload the model to pick up the new labels. 
If a label + file is added or removed the corresponding edit to the + *label_filename* property of the output it corresponds to in the + [model configuration](model_configuration.md) must be performed at + the same time. + +## Modifying the Model Repository + +Each model in a model repository [resides in its own +sub-directory](model_repository.md#repository-layout). The activity +allowed on the contents of a model's sub-directory varies depending on +how Triton is using that model. The state of a model can be determined +by using the [model +metadata](../customization_guide/inference_protocols.md#inference-protocols-and-apis) or +[repository index](../protocol/extension_model_repository.md#index) APIs. + +* If the model is actively loading or unloading, no files or +directories within that sub-directory must be added, removed or +modified. + +* If the model has never been loaded or has been completely unloaded, + then the entire model sub-directory can be removed or any of its + contents can be added, removed or modified. + +* If the model has been completely loaded then any files or +directories within that sub-directory can be added, removed or +modified; except for shared libraries implementing the model's +backend. Triton uses the backend shared libraries while the model is +loading so removing or modifying them will likely cause Triton to +crash. To update a model's backend you must first unload the model +completely, modify the backend shared libraries, and then reload the +model. On some OSes it may also be possible to simply move the +existing shared-libraries to another location outside of the model +repository, copy in the new shared libraries, and then reload the +model. + +* If only the model instance configuration on the 'config.pbtxt' is modified +(i.e. increasing/decreasing the instance count), then Triton will update the +model rather then reloading it, when either a load request is received under +[Model Control Mode EXPLICIT](#model-control-mode-explicit) or change to the +'config.pbtxt' is detected under +[Model Control Mode POLL](#model-control-mode-poll). + * The new model configuration may also be passed to Triton via the +[load API](../protocol/extension_model_repository.md#load). + * Some text editors create a swap file in the model directory when the +'config.pbtxt' is modified in place. The swap file is not part of the model +configuration, so its presence in the model directory may be detected as a new file +and cause the model to fully reload when only an update is expected. + +* If a sequence model is *updated* (i.e. decreasing the instance count), Triton +will wait until the in-flight sequence is completed (or timed-out) before the +instance behind the sequence is removed. + * If the instance count is decreased, arbitrary instance(s) are selected among +idle instances and instances with in-flight sequence(s) for removal. + +* If a sequence model is *reloaded* with in-flight sequence(s) (i.e. changes to +the model file), Triton does not guarantee any remaining request(s) from the +in-flight sequence(s) will be routed to the same model instance for processing. +It is currently the responsibility of the user to ensure any in-flight +sequence(s) are completed before reloading a sequence model. + +## Concurrently Loading Models + +To reduce service downtime, Triton loads new models in the background while +continuing to serve inferences on existing models. 
Based on use case and +performance requirements, the optimal amount of resources dedicated to loading +models may differ. Triton exposes a `--model-load-thread-count` option to +configure the number of threads dedicated to loading models, which defaults to 4. + +To set this parameter with the C API, refer to +`TRITONSERVER_ServerOptionsSetModelLoadThreadCount` in +[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). + diff --git a/docs/user_guide/model_repository.md b/docs/user_guide/model_repository.md new file mode 100644 index 0000000000..952334dadd --- /dev/null +++ b/docs/user_guide/model_repository.md @@ -0,0 +1,513 @@ + + +# Model Repository + +**Is this your first time setting up a model repository?** Check out +[these tutorials](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_1-model_deployment#setting-up-the-model-repository) + to begin your Triton journey! + +The Triton Inference Server serves models from one or more model +repositories that are specified when the server is started. While +Triton is running, the models being served can be modified as +described in [Model Management](model_management.md). + +## Repository Layout + +These repository paths are specified when Triton is started using the +--model-repository option. The --model-repository option can be +specified multiple times to included models from multiple +repositories. The directories and files that compose a model +repository must follow a required layout. Assuming a repository path +is specified as follows. + +```bash +$ tritonserver --model-repository= +``` + +The corresponding repository layout must be: + +``` + / + / + [config.pbtxt] + [ ...] + [configs]/ + [ ...] + / + + / + + ... + / + [config.pbtxt] + [ ...] + [configs]/ + [ ...] + / + + / + + ... + ... +``` + +Within the top-level model repository directory there must be zero or +more sub-directories. Each of the +sub-directories contains the repository information for the +corresponding model. The config.pbtxt file describes the [model +configuration](model_configuration.md) for the model. For some models, +config.pbtxt is required while for others it is optional. See +[Auto-Generated Model +Configuration](model_configuration.md#auto-generated-model-configuration) +for more information. + +Each directory may include an optional sub-directory configs. +Within the configs directory there must be zero or more +with .pbtxt file extension. For more information about how the custom model +configuration is handled by Triton see [Custom Model Configuration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#custom-model-configuration). + +Each directory must have at least one numeric +sub-directory representing a version of the model. For more +information about how the model versions are handled by Triton see +[Model Versions](#model-versions). Each model is executed by a +specific +[backend](https://github.com/triton-inference-server/backend/blob/main/README.md). +Within each version sub-directory there must be the files required by +that backend. For example, models that use framework backends such as +TensorRT, PyTorch, ONNX, OpenVINO and TensorFlow must provide the +[framework-specific model files](#model-files). + +## Model Repository Locations + +Triton can access models from one or more locally accessible file +paths, from Google Cloud Storage, from Amazon S3, and from Azure +Storage. 
+ +### Local File System + +For a locally accessible file-system the absolute path must be +specified. + +```bash +$ tritonserver --model-repository=/path/to/model/repository ... +``` + +### Cloud Storage with Environment variables + +#### Google Cloud Storage + +For a model repository residing in Google Cloud Storage, the +repository path must be prefixed with gs://. + +```bash +$ tritonserver --model-repository=gs://bucket/path/to/model/repository ... +``` + +When using Google Cloud Storage, credentials are fetched and attempted in the +following order: +1. [GOOGLE_APPLICATION_CREDENTIALS environment variable](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) + - The environment variable should be set and contains the location of a +credential JSON file. + - Authorized user credential will be attempted first, and then service +account credential. +2. [The attached service account](https://cloud.google.com/docs/authentication/application-default-credentials#attached-sa) + - A value for the +[Authorization HTTP header](https://googleapis.dev/cpp/google-cloud-storage/1.42.0/classgoogle_1_1cloud_1_1storage_1_1oauth2_1_1ComputeEngineCredentials.html#a8c3a5d405366523e2f4df06554f0a676) +should be obtainable. +3. Anonymous credential (also known as public bucket) + - The bucket (and objects) should have granted `get` and `list` permission to +all users. + - One way to grant such permission is by adding both +[storage.objectViewer](https://cloud.google.com/storage/docs/access-control/iam-roles#standard-roles) +and +[storage.legacyBucketReader](https://cloud.google.com/storage/docs/access-control/iam-roles#legacy-roles) +predefined roles for "allUsers" to the bucket, for example: + ``` + $ gsutil iam ch allUsers:objectViewer "${BUCKET_URL}" + $ gsutil iam ch allUsers:legacyBucketReader "${BUCKET_URL}" + ``` + +By default, Triton makes a local copy of a remote model repository in +a temporary folder, which is deleted after Triton server is shut down. +If you would like to control where remote model repository is copied to, +you may set the `TRITON_GCS_MOUNT_DIRECTORY` environment variable to +a path pointing to the existing folder on your local machine. + +```bash +export TRITON_GCS_MOUNT_DIRECTORY=/path/to/your/local/directory +``` + +**Make sure, that `TRITON_GCS_MOUNT_DIRECTORY` exists on your local machine +and it is empty.** + +#### S3 + +For a model repository residing in Amazon S3, the path must be +prefixed with s3://. + +```bash +$ tritonserver --model-repository=s3://bucket/path/to/model/repository ... +``` + +For a local or private instance of S3, the prefix s3:// must be +followed by the host and port (separated by a semicolon) and +subsequently the bucket path. + +```bash +$ tritonserver --model-repository=s3://host:port/bucket/path/to/model/repository ... +``` + +By default, Triton uses HTTP to communicate with your instance of S3. If +your instance of S3 supports HTTPS and you wish for Triton to use the HTTPS +protocol to communicate with it, you can specify the same in the model +repository path by prefixing the host name with https://. + +```bash +$ tritonserver --model-repository=s3://https://host:port/bucket/path/to/model/repository ... +``` + +When using S3, the credentials and default region can be passed by +using either the [aws +config](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) +command or via the respective [environment +variables](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html). 
+If the environment variables are set they will take a higher priority +and will be used by Triton instead of the credentials set using the +aws config command. + +By default, Triton makes a local copy of a remote model repository +in a temporary folder, which is deleted after Triton server is shut down. +If you would like to control where remote model repository is copied to, +you may set the `TRITON_AWS_MOUNT_DIRECTORY` environment variable to +a path pointing to the existing folder on your local machine. + +```bash +export TRITON_AWS_MOUNT_DIRECTORY=/path/to/your/local/directory +``` + +**Make sure, that `TRITON_AWS_MOUNT_DIRECTORY` exists on your local machine +and it is empty.** + +#### Azure Storage + +For a model repository residing in Azure Storage, the repository path +must be prefixed with as://. + +```bash +$ tritonserver --model-repository=as://account_name/container_name/path/to/model/repository ... +``` + +When using Azure Storage, you must set the `AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_KEY` +environment variables to an account that has access to the Azure Storage repository. + +If you don't know your `AZURE_STORAGE_KEY` and have your Azure CLI correctly configured, +here's an example of how to find a key corresponding to your `AZURE_STORAGE_ACCOUNT`: + +```bash +$ export AZURE_STORAGE_ACCOUNT="account_name" +$ export AZURE_STORAGE_KEY=$(az storage account keys list -n $AZURE_STORAGE_ACCOUNT --query "[0].value") +``` +By default, Triton makes a local copy of a remote model repository in +a temporary folder, which is deleted after Triton server is shut down. +If you would like to control where remote model repository is copied to, +you may set the `TRITON_AZURE_MOUNT_DIRECTORY` environment variable to a path +pointing to the existing folder on your local machine. + +```bash +export TRITON_AZURE_MOUNT_DIRECTORY=/path/to/your/local/directory +``` + +**Make sure, that `TRITON_AZURE_MOUNT_DIRECTORY` exists on your local machine +and it is empty.** + + +### Cloud Storage with Credential file (Beta) + +*This feature is currently in beta and may be subject to change.* + +To group the credentials into a single file for Triton, you may set the +`TRITON_CLOUD_CREDENTIAL_PATH` environment variable to a path pointing to a +JSON file of the following format, residing in the local file system. + +``` +export TRITON_CLOUD_CREDENTIAL_PATH="cloud_credential.json" +``` + +"cloud_credential.json": +``` +{ + "gs": { + "": "PATH_TO_GOOGLE_APPLICATION_CREDENTIALS", + "gs://gcs-bucket-002": "PATH_TO_GOOGLE_APPLICATION_CREDENTIALS_2" + }, + "s3": { + "": { + "secret_key": "AWS_SECRET_ACCESS_KEY", + "key_id": "AWS_ACCESS_KEY_ID", + "region": "AWS_DEFAULT_REGION", + "session_token": "", + "profile": "" + }, + "s3://s3-bucket-002": { + "secret_key": "AWS_SECRET_ACCESS_KEY_2", + "key_id": "AWS_ACCESS_KEY_ID_2", + "region": "AWS_DEFAULT_REGION_2", + "session_token": "AWS_SESSION_TOKEN_2", + "profile": "AWS_PROFILE_2" + } + }, + "as": { + "": { + "account_str": "AZURE_STORAGE_ACCOUNT", + "account_key": "AZURE_STORAGE_KEY" + }, + "as://Account-002/Container": { + "account_str": "", + "account_key": "" + } + } +} +``` + +To match a credential, the longest matching credential name against the start +of a given path is used. For example: `gs://gcs-bucket-002/model_repository` +will match the "gs://gcs-bucket-002" GCS credential, and +`gs://any-other-gcs-bucket` will match the "" GCS credential. + +This feature is intended for use-cases which multiple credentials are needed +for each cloud storage provider. 
Be sure to replace any credential paths/keys +with the actual paths/keys from the example above. + +If the `TRITON_CLOUD_CREDENTIAL_PATH` environment variable is not set, the +[Cloud Storage with Environment variables](#cloud-storage-with-environment-variables) +will be used. + +### Caching of Cloud Storage + +Triton currently doesn't perform file caching for cloud storage. +However, this functionality can be implemented through +[repository agent API](https://github.com/triton-inference-server/server/blob/bbbcad7d87adc9596f99e3685da5d6b73380514f/docs/customization_guide/repository_agents.md) by injecting a proxy, which checks a specific local directory for caching +given the cloud storage (original path) of the model, +and then decides if cached files may be used. + +## Model Versions + +Each model can have one or more versions available in the model +repository. Each version is stored in its own, numerically named, +subdirectory where the name of the subdirectory corresponds to the +version number of the model. The subdirectories that are not +numerically named, or have names that start with zero (0) will be +ignored. Each model configuration specifies a [version +policy](model_configuration.md#version-policy) that controls which of +the versions in the model repository are made available by Triton at +any given time. + +## Model Files + +The contents of each model version sub-directory is determined by the +type of the model and the requirements of the +[backend](https://github.com/triton-inference-server/backend/blob/main/README.md) +that supports the model. + +### TensorRT Models + +A TensorRT model definition is called a *Plan*. A TensorRT Plan is a +single file that by default must be named model.plan. This default +name can be overridden using the *default_model_filename* property in +the [model configuration](model_configuration.md). + +A TensorRT Plan is specific to a GPU's [CUDA Compute +Capability](https://developer.nvidia.com/cuda-gpus). As a result, +TensorRT models will need to set the *cc_model_filenames* property in +the [model configuration](model_configuration.md) to associate each +Plan file with the corresponding Compute Capability. + +A minimal model repository for a TensorRT model is: + +``` + / + / + config.pbtxt + 1/ + model.plan +``` + +### ONNX Models + +An ONNX model is a single file or a directory containing multiple +files. By default the file or directory must be named model.onnx. +This default name can be overridden using the *default_model_filename* +property in the [model configuration](model_configuration.md). + +Triton supports all ONNX models that are supported by the version of +[ONNX Runtime](https://github.com/Microsoft/onnxruntime) being used by +Triton. Models will not be supported if they use a [stale ONNX opset +version](https://github.com/Microsoft/onnxruntime/blob/master/docs/Versioning.md#version-matrix) +or [contain operators with unsupported +types](https://github.com/microsoft/onnxruntime/issues/1122). + +A minimal model repository for a ONNX model contained in a single file +is: + +``` + / + / + config.pbtxt + 1/ + model.onnx +``` + +An ONNX model composed from multiple files must be contained in a +directory. By default this directory must be named model.onnx but can +be overridden using the *default_model_filename* property in the +[model configuration](model_configuration.md). The main model file +within this directory must be named model.onnx. 
A minimal model +repository for a ONNX model contained in a directory is: + +``` + / + / + config.pbtxt + 1/ + model.onnx/ + model.onnx + +``` + +### TorchScript Models + +An TorchScript model is a single file that by default must be named +model.pt. This default name can be overridden using the +*default_model_filename* property in the [model +configuration](model_configuration.md). It is possible that some +models traced with different versions of PyTorch may not be supported +by Triton due to changes in the underlying opset. + +A minimal model repository for a TorchScript model is: + +``` + / + / + config.pbtxt + 1/ + model.pt +``` + +### TensorFlow Models + +TensorFlow saves models in one of two formats: *GraphDef* or +*SavedModel*. Triton supports both formats. + +A TensorFlow GraphDef is a single file that by default must be named +model.graphdef. A TensorFlow SavedModel is a directory containing +multiple files. By default the directory must be named +model.savedmodel. These default names can be overridden using the +*default_model_filename* property in the [model +configuration](model_configuration.md). + +A minimal model repository for a TensorFlow +GraphDef model is: + +``` + / + / + config.pbtxt + 1/ + model.graphdef +``` + +A minimal model repository for a TensorFlow SavedModel model is: + +``` + / + / + config.pbtxt + 1/ + model.savedmodel/ + +``` + +### OpenVINO Models + +An OpenVINO model is represented by two files, a *.xml and *.bin +file. By default the *.xml file must be named model.xml. This default +name can be overridden using the *default_model_filename* property in +the [model configuration](model_configuration.md). + +A minimal model repository for an OpenVINO model is: + +``` + / + / + config.pbtxt + 1/ + model.xml + model.bin +``` + +### Python Models + +The [Python +backend](https://github.com/triton-inference-server/python_backend) +allows you to run Python code as a model within Triton. By default the +Python script must be named model.py but this default name can be +overridden using the *default_model_filename* property in the [model +configuration](model_configuration.md). + +A minimal model repository for a Python model is: + +``` + / + / + config.pbtxt + 1/ + model.py +``` + +### DALI Models + +The [DALI backend](https://github.com/triton-inference-server/dali_backend) +allows you to run a [DALI pipeline](https://github.com/NVIDIA/DALI) as +a model within Triton. In order to use this backend, you need to generate +a file, by default named `model.dali`, and include it in your model repository. +Please refer to [DALI backend documentation +](https://github.com/triton-inference-server/dali_backend#how-to-use) for the +description, how to generate `model.dali`. The default model file name can be +overridden using the *default_model_filename* property in the +[model configuration](model_configuration.md). + +A minimal model repository for a DALI model is: + +``` + / + / + config.pbtxt + 1/ + model.dali +``` diff --git a/docs/user_guide/optimization.md b/docs/user_guide/optimization.md new file mode 100644 index 0000000000..5ca3d376b2 --- /dev/null +++ b/docs/user_guide/optimization.md @@ -0,0 +1,450 @@ + + +# Optimization + +The Triton Inference Server has many features that you can use to +decrease latency and increase throughput for your model. This section +discusses these features and demonstrates how you can use them to +improve the performance of your model. 
As a prerequisite you should +follow the [QuickStart](../getting_started/quickstart.md) to get Triton and client +examples running with the example model repository. + +This section focuses on understanding latency and throughput tradeoffs +for a single model. The [Model Analyzer](model_analyzer.md) section +describes a tool that helps you understand the GPU memory utilization +of your models so you can decide how to best run multiple models on a +single GPU. + +Unless you already have a client application suitable for measuring +the performance of your model on Triton, you should familiarize +yourself with +[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). +The Performance Analyzer is an essential tool for optimizing your model's +performance. + +As a running example demonstrating the optimization features and +options, we will use a TensorFlow Inception model that you can obtain +by following the [QuickStart](../getting_started/quickstart.md). As a baseline we use +perf_analyzer to determine the performance of the model using a [basic +model configuration that does not enable any performance +features](../examples/model_repository/inception_graphdef/config.pbtxt). + +``` +$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 1:4 +... +Inferences/Second vs. Client p95 Batch Latency +Concurrency: 1, throughput: 62.6 infer/sec, latency 21371 usec +Concurrency: 2, throughput: 73.2 infer/sec, latency 34381 usec +Concurrency: 3, throughput: 73.2 infer/sec, latency 50298 usec +Concurrency: 4, throughput: 73.4 infer/sec, latency 65569 usec +``` + +The results show that our non-optimized model configuration gives a +throughput of about 73 inferences per second. Note how there is a +significant throughput increase going from one concurrent request to +two concurrent requests and then throughput levels off. With one +concurrent request Triton is idle during the time when the response is +returned to the client and the next request is received at the +server. Throughput increases with a concurrency of two because Triton +overlaps the processing of one request with the communication of the +other. Because we are running perf_analyzer on the same system as +Triton, two requests are enough to completely hide the communication +latency. + +## Optimization Settings + +For most models, the Triton feature that provides the largest +performance improvement is [dynamic +batching](model_configuration.md#dynamic-batcher). +[This example](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#dynamic-batching--concurrent-model-execution) + sheds more light on conceptual details. If your model does not +support batching then you can skip ahead to [Model +Instances](#model-instances). + + +### Dynamic Batcher + +The dynamic batcher combines individual inference requests into a +larger batch that will often execute much more efficiently than +executing the individual requests independently. To enable the dynamic +batcher stop Triton, add the following line to the end of the [model +configuration file for +inception_graphdef](../examples/model_repository/inception_graphdef/config.pbtxt), +and then restart Triton. + +``` +dynamic_batching { } +``` + +The dynamic batcher allows Triton to handle a higher number of +concurrent requests because those requests are combined for +inference. To see this run perf_analyzer with request concurrency from +1 to 8. 
+
+```
+$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 1:8
+...
+Inferences/Second vs. Client p95 Batch Latency
+Concurrency: 1, throughput: 66.8 infer/sec, latency 19785 usec
+Concurrency: 2, throughput: 80.8 infer/sec, latency 30732 usec
+Concurrency: 3, throughput: 118 infer/sec, latency 32968 usec
+Concurrency: 4, throughput: 165.2 infer/sec, latency 32974 usec
+Concurrency: 5, throughput: 194.4 infer/sec, latency 33035 usec
+Concurrency: 6, throughput: 217.6 infer/sec, latency 34258 usec
+Concurrency: 7, throughput: 249.8 infer/sec, latency 34522 usec
+Concurrency: 8, throughput: 272 infer/sec, latency 35988 usec
+```
+
+With eight concurrent requests the dynamic batcher allows Triton to
+provide 272 inferences per second without increasing latency
+compared to not using the dynamic batcher.
+
+Instead of having perf_analyzer collect data for a range of request
+concurrency values, we can use a couple of simple rules that typically
+apply when perf_analyzer is running on the same system as Triton. The
+first rule is that, for minimum latency, set the request concurrency to
+1, disable the dynamic batcher, and use only one [model
+instance](#model-instances). The second rule is that, for maximum
+throughput, set the request concurrency to
+`2 * <maximum batch size> * <model instance count>`. We will discuss model
+instances [below](#model-instances); for now we are working with one model
+instance. So for a maximum batch size of 4 we want to run perf_analyzer
+with a request concurrency of `2 * 4 * 1 = 8`.
+
+```
+$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 8
+...
+Inferences/Second vs. Client p95 Batch Latency
+Concurrency: 8, throughput: 267.8 infer/sec, latency 35590 usec
+```
+
+### Model Instances
+
+Triton allows you to specify how many copies of each model you want to
+make available for inferencing. By default you get one copy of each
+model, but you can specify any number of instances in the model
+configuration by using [instance
+groups](model_configuration.md#instance-groups). Typically, having two
+instances of a model will improve performance because it allows
+overlap of memory transfer operations (for example, CPU to/from GPU)
+with inference compute. Multiple instances also improve GPU
+utilization by allowing more inference work to be executed
+simultaneously on the GPU. Smaller models may benefit from more than
+two instances; you can use perf_analyzer to experiment.
+
+To specify two instances of the inception_graphdef model: stop Triton,
+remove any dynamic batching settings you may have previously added to
+the model configuration (we discuss combining dynamic batcher and
+multiple model instances below), add the following lines to the end of
+the [model configuration
+file](../examples/model_repository/inception_graphdef/config.pbtxt), and
+then restart Triton.
+
+```
+instance_group [ { count: 2 }]
+```
+
+Now run perf_analyzer using the same options as for the baseline.
+
+```
+$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 1:4
+...
+Inferences/Second vs. Client p95 Batch Latency
+Concurrency: 1, throughput: 70.6 infer/sec, latency 19547 usec
+Concurrency: 2, throughput: 106.6 infer/sec, latency 23532 usec
+Concurrency: 3, throughput: 110.2 infer/sec, latency 36649 usec
+Concurrency: 4, throughput: 108.6 infer/sec, latency 43588 usec
+```
+
+In this case having two instances of the model increases throughput
+from about 73 inferences per second to about 110 inferences per second
+compared with one instance.
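+
+For reference, an instance group can also pin instances to specific
+devices. The following is a minimal sketch rather than the configuration
+used in this example, and the device index is an assumption for
+illustration; see [instance groups](model_configuration.md#instance-groups)
+for the full set of fields.
+
+```
+instance_group [
+  {
+    count: 2
+    kind: KIND_GPU
+    gpus: [ 0 ]
+  }
+]
+```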
+ +It is possible to enable both the dynamic batcher and multiple model +instances, for example, change the model configuration file to include +the following. + +``` +dynamic_batching { } +instance_group [ { count: 2 }] +``` + +When we run perf_analyzer with the same options used for just the +dynamic batcher above. + +``` +$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 16 +... +Inferences/Second vs. Client p95 Batch Latency +Concurrency: 16, throughput: 289.6 infer/sec, latency 59817 usec +``` + +We see that two instances does not improve throughput much while +increasing latency, compared with just using the dynamic batcher and +one instance. This occurs because for this model the dynamic batcher +alone is capable of fully utilizing the GPU and so adding additional +model instances does not provide any performance advantage. In general +the benefit of the dynamic batcher and multiple instances is model +specific, so you should experiment with perf_analyzer to determine the +settings that best satisfy your throughput and latency requirements. + +## Framework-Specific Optimization + +Triton has several optimization settings that apply to only a subset +of the supported model frameworks. These optimization settings are +controlled by the model configuration [optimization +policy](model_configuration.md#optimization-policy). Visit +[this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_4-inference_acceleration) + for an end to end discussion. + +### ONNX with TensorRT Optimization (ORT-TRT) + +One especially powerful optimization is to use TensorRT in +conjunction with an ONNX model. As an example of TensorRT optimization +applied to an ONNX model, we will use an ONNX DenseNet model that you +can obtain by following [QuickStart](../getting_started/quickstart.md). As a baseline we +use perf_analyzer to determine the performance of the model using a +[basic model configuration that does not enable any performance +features](../examples/model_repository/densenet_onnx/config.pbtxt). + +``` +$ perf_analyzer -m densenet_onnx --percentile=95 --concurrency-range 1:4 +... +Inferences/Second vs. Client p95 Batch Latency +Concurrency: 1, 113.2 infer/sec, latency 8939 usec +Concurrency: 2, 138.2 infer/sec, latency 14548 usec +Concurrency: 3, 137.2 infer/sec, latency 21947 usec +Concurrency: 4, 136.8 infer/sec, latency 29661 usec +``` + +To enable TensorRT optimization for the model: stop Triton, add the +following lines to the end of the model configuration file, and then +restart Triton. + +``` +optimization { execution_accelerators { + gpu_execution_accelerator : [ { + name : "tensorrt" + parameters { key: "precision_mode" value: "FP16" } + parameters { key: "max_workspace_size_bytes" value: "1073741824" } + }] +}} +``` + +As Triton starts you should check the console output and wait until +Triton prints the "Staring endpoints" message. ONNX model loading can +be significantly slower when TensorRT optimization is enabled. In +production you can use [model warmup](model_configuration.md#model-warmup) +to avoid this model startup/optimization slowdown. Now +run perf_analyzer using the same options as for the baseline. + +``` +$ perf_analyzer -m densenet_onnx --percentile=95 --concurrency-range 1:4 +... +Inferences/Second vs. 
Client p95 Batch Latency +Concurrency: 1, 190.6 infer/sec, latency 5384 usec +Concurrency: 2, 273.8 infer/sec, latency 7347 usec +Concurrency: 3, 272.2 infer/sec, latency 11046 usec +Concurrency: 4, 266.8 infer/sec, latency 15089 usec +``` + +The TensorRT optimization provided 2x throughput improvement while +cutting latency in half. The benefit provided by TensorRT will vary +based on the model, but in general it can provide significant +performance improvement. + +### ONNX with OpenVINO Optimization + +ONNX models running on the CPU can also be accelerated by using +[OpenVINO](https://docs.openvinotoolkit.org/latest/index.html). To +enable OpenVINO optimization for an ONNX model, add the following +lines to the end of the model's configuration file. + +``` +optimization { execution_accelerators { + cpu_execution_accelerator : [ { + name : "openvino" + }] +}} +``` + +### TensorFlow with TensorRT Optimization (TF-TRT) + +TensorRT optimization applied to a TensorFlow model works similarly to +TensorRT and ONNX described above. To enable TensorRT optimization you +must set the model configuration appropriately. For TensorRT +optimization of TensorFlow models there are several options that you +can enable, including selection of the compute precision. + +``` +optimization { execution_accelerators { + gpu_execution_accelerator : [ { + name : "tensorrt" + parameters { key: "precision_mode" value: "FP16" }}] +}} +``` + +The options are described in detail in the +[ModelOptimizationPolicy](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +section of the model configuration protobuf. + +As an example of TensorRT optimization applied to a TensorFlow model, +we will use a TensorFlow Inception model that you can obtain by +following the [QuickStart](../getting_started/quickstart.md). As a baseline we use +perf_analyzer to determine the performance of the model using a [basic +model configuration that does not enable any performance +features](../examples/model_repository/inception_graphdef/config.pbtxt). + +``` +$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 1:4 +... +Inferences/Second vs. Client p95 Batch Latency +Concurrency: 1, throughput: 62.6 infer/sec, latency 21371 usec +Concurrency: 2, throughput: 73.2 infer/sec, latency 34381 usec +Concurrency: 3, throughput: 73.2 infer/sec, latency 50298 usec +Concurrency: 4, throughput: 73.4 infer/sec, latency 65569 usec +``` + +To enable TensorRT optimization for the model: stop Triton, add the +lines from above to the end of the model configuration file, and then +restart Triton. As Triton starts you should check the console output +and wait until the server prints the "Staring endpoints" message. Now +run perf_analyzer using the same options as for the baseline. Note +that the first run of perf_analyzer might timeout because the TensorRT +optimization is performed when the inference request is received and +may take significant time. In production you can use [model +warmup](model_configuration.md#model-warmup) to avoid this model +startup/optimization slowdown. For now, if this happens just run +perf_analyzer again. + +``` +$ perf_analyzer -m inception_graphdef --percentile=95 --concurrency-range 1:4 +... +Inferences/Second vs. 
Client p95 Batch Latency +Concurrency: 1, throughput: 140 infer/sec, latency 8987 usec +Concurrency: 2, throughput: 195.6 infer/sec, latency 12583 usec +Concurrency: 3, throughput: 189 infer/sec, latency 19020 usec +Concurrency: 4, throughput: 191.6 infer/sec, latency 24622 usec +``` + +The TensorRT optimization provided 2.5x throughput improvement while +cutting latency by more than half. The benefit provided by TensorRT +will vary based on the model, but in general it can provide +significant performance improvement. + +### TensorFlow JIT Graph Optimizations + +Tensorflow allows its user to specify the optimization level +while running the model graph via GlobalJitLevel setting. +See [config.proto](https://github.com/tensorflow/tensorflow/blob/v2.10.0/tensorflow/core/protobuf/config.proto) +for more information. When running +TensorFlow models in Triton, the users can provide this setting +by providing graph levels like below: + +``` +optimization { + graph { level: 1 +}} +``` + +The users can also utilize the [XLA optimization](https://www.tensorflow.org/xla) +by setting `TF_XLA_FLAGS` environment variable before launching +Triton. An example to launch Triton with GPU and CPU auto-clustering: + +``` +$ TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit" tritonserver --model-repository=... +``` + +As in the case of TensorRT optimization above, these optimizations +occur when the first inference request is run. To mitigate the +model startup slowdown in production systems, you can use +[model warmup](model_configuration.md#model-warmup). + +### TensorFlow Automatic FP16 Optimization + +TensorFlow has an option to provide FP16 optimization that can be +enabled in the model configuration. As with the TensorRT optimization +described above, you can enable this optimization by using the +gpu_execution_accelerator property. + +``` +optimization { execution_accelerators { + gpu_execution_accelerator : [ + { name : "auto_mixed_precision" } + ] +}} +``` + +The options are described in detail in the +[ModelOptimizationPolicy](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +section of the model configuration protobuf. + +You can follow the steps described above for TensorRT to see how this +automatic FP16 optimization benefits a model by using perf_analyzer +to evaluate the model's performance with and without the optimization. + +## NUMA Optimization + +Many modern CPUs are composed of multiple cores, memories and interconnects that +expose different performance characteristics depending on how threads and +data are allocated. +Triton allows you to set host policies that describe this +[NUMA](https://www.kernel.org/doc/html/latest/mm/numa.html) configuration for +your system and then assign model instances to different host policies +to exploit these NUMA properties. + +### Host Policy + +Triton allows you to specify host policy that associates with a policy name on +startup. A host policy will be applied to a model instance if the instance is +specified with the same policy name by using host policy field in [instance +groups](model_configuration.md#instance-groups). Note that if not specified, +the host policy field will be set to default name based on the instance +property. 
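+
+As a point of reference, the association between a model instance and a
+host policy is made in the model configuration. The snippet below is a
+hedged sketch; the policy name "gpu_0" is chosen only to match the
+command-line example at the end of this section.
+
+```
+instance_group [
+  {
+    count: 1
+    kind: KIND_GPU
+    gpus: [ 0 ]
+    host_policy: "gpu_0"
+  }
+]
+```
+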
+ +To specify a host policy, you can specify the following in command line option: +``` +--host-policy=,= +``` + +Currently, the supported settings are the following: + +* *numa-node*: The NUMA node id that the host policy will be bound to, the + host policy restricts memory allocation to the node specified. + +* *cpu-cores*: The CPU cores to be run on, the instance with this host policy + set will be running on one of those CPU cores. + +Assuming that the system is configured to bind GPU 0 with NUMA node 0 which has +CPU cores from 0 to 15, the following shows setting the numa-node and cpu-cores +policies for "gpu_0": + +``` +$ tritonserver --host-policy=gpu_0,numa-node=0 --host-policy=gpu_0,cpu-cores=0-15 ... +``` diff --git a/docs/user_guide/perf_analyzer.md b/docs/user_guide/perf_analyzer.md new file mode 100644 index 0000000000..0631b404c5 --- /dev/null +++ b/docs/user_guide/perf_analyzer.md @@ -0,0 +1,30 @@ + + +Perf Analyzer documentation has been relocated to +[here](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md). diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md new file mode 100644 index 0000000000..efea32a63b --- /dev/null +++ b/docs/user_guide/performance_tuning.md @@ -0,0 +1,393 @@ + + +# Deploying your trained model using Triton + +Given a trained model, how do I deploy it at-scale with an optimal configuration +using Triton Inference Server? This document is here to help answer that. + +For those who like a [high level overview](#overview), below is the common flow +for most use cases. + +For those who wish to jump right in, skip to the +[end-to-end example](#end-to-end-example). + +For additional material, see the +[Triton Conceptual Guide tutorial](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_4-inference_acceleration). + +## Overview + +1. Is my model compatible with Triton? + - If your model falls under one of Triton's + [supported backends](https://github.com/triton-inference-server/backend), + then we can simply try to deploy the model as described in the + [Quickstart](../getting_started/quickstart.md) guide. + For the ONNXRuntime, TensorFlow SavedModel, and TensorRT backends, the + minimal model configuration can be inferred from the model using Triton's + [AutoComplete](model_configuration.md#auto-generated-model-configuration) + feature. + This means that a `config.pbtxt` may still be provided, but is not required + unless you want to explicitly set certain parameters. + Additionally, by enabling verbose logging via `--log-verbose=1`, you can see + the complete config that Triton sees internally in the server log output. + For other backends, refer to the + [Minimal Model Configuration](model_configuration.md#minimal-model-configuration) + required to get started. + - If your model does not come from a supported backend, you can look into + the [Python Backend](https://github.com/triton-inference-server/python_backend) + or writing a + [Custom C++ Backend](https://github.com/triton-inference-server/backend/blob/main/examples/README.md) + to support your model. The Python Backend provides a simple interface to + execute requests through a generic python script, but may not be as + performant as a Custom C++ Backend. Depending on your use case, the Python + Backend performance may be a sufficient tradeoff for the simplicity of + implementation. + +2. Can I run inference on my served model? 
+ - Assuming you were able to load your model on Triton, the next step is to + verify that we can run inference requests and get a baseline performance + benchmark of your model. + Triton's + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) + tool specifically fits this purpose. Here is a simplified output for + demonstration purposes: + + ``` + # NOTE: "my_model" represents a model currently being served by Triton + $ perf_analyzer -m my_model + ... + + Inferences/Second vs. Client Average Batch Latency + Concurrency: 1, throughput: 482.8 infer/sec, latency 12613 usec + ``` + + - This gives us a sanity test that we are able to successfully form input + requests and receive output responses to communicate with the model backend + via Triton APIs. + - If Perf Analyzer fails to send requests and it is unclear from the error + how to proceed, then you may want to sanity check that your model + `config.pbtxt` inputs/outputs match what the model expects. If the config + is correct, check that the model runs successfully using its original + framework directly. If you don't have your own script or tool to do so, + [Polygraphy](https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy) + is a useful tool to run sample inferences on your model via various + frameworks. Currently, Polygraphy supports ONNXRuntime, TensorRT, and + TensorFlow 1.x. + - The definition of "performing well" is subject to change for each use + case. Some common metrics are throughput, latency, and GPU utilization. + There are many variables that can be tweaked just within your model + configuration (`config.pbtxt`) to obtain different results. + - As your model, config, or use case evolves, + [Perf Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md) + is a great tool to quickly verify model functionality and performance. + +3. How can I improve my model performance? + - To further understand the best model configuration you can provide to + Triton for your use case, Triton's + [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) + tool can help. + Model Analyzer can automatically or + [manually](https://github.com/triton-inference-server/model_analyzer/blob/main/docs/config_search.md) + search through config combinations to find the optimal triton configuration + to meet your constraints. After running Model Analyzer to find the optimal + configurations for your model/use case, you can transfer the generated + config files to your [Model Repository](model_repository.md). + Model Analyzer provides a + [Quickstart](https://github.com/triton-inference-server/model_analyzer/blob/main/docs/quick_start.md) + guide with some examples to walk through. + - Upon serving the model with the newly optimized configuration file found + by Model Analyzer and running Perf Analyzer again, you should expect to find + better performance numbers in most cases compared to a default config. + - Some parameters that can be tuned for a model may not be exposed to Model + Analyzer's automatic search since they don't apply to all models. + For instance, [backends](https://github.com/triton-inference-server/backend) + can expose backend-specific configuration options that can be tuned as well. 
+ The [ONNXRuntime + Backend](https://github.com/triton-inference-server/onnxruntime_backend), + for example, has several + [parameters](https://github.com/triton-inference-server/onnxruntime_backend#model-config-options) + that affect the level of parallelization when executing inference on a + model. + These backend-specific options may be worth investigating if the defaults + are not providing sufficient performance. To tune custom sets of + parameters, Model Analyzer supports + [Manual Configuration Search](https://github.com/triton-inference-server/model_analyzer/blob/main/docs/config_search.md). + - To learn more about further optimizations for your model configuration, + see the [Optimization](optimization.md) docs. + +### Other Areas of Interest + +1. My model performs slowly when it is first loaded by Triton +(cold-start penalty), what do I do? + - Triton exposes the ability to run + [ModelWarmup](model_configuration.md#model-warmup) requests when first + loading the model to ensure that the model is sufficiently warmed up before + being marked "READY" for inference. + +2. Why doesn't my model perform significantly faster on GPU? + - Most official backends supported by Triton are optimized for GPU inference + and should perform well on GPU out of the box. + - Triton exposes options for you to optimize your model further on the GPU. + Triton's + [Framework Specific Optimizations](optimization.md#framework-specific-optimization) + goes into further detail on this topic. + - Complete conversion of your model to a backend fully optimized for GPU + inference such as [TensorRT](https://developer.nvidia.com/tensorrt) may + provide even better results. + You may find more Triton-specific details about TensorRT in the + [TensorRT Backend](https://github.com/triton-inference-server/tensorrt_backend). + - If none of the above can help get sufficient GPU-accelerated performance + for your model, the model may simply be better designed for CPU execution + and the [OpenVINO Backend](https://github.com/triton-inference-server/openvino_backend) may + help further optimize your CPU execution. + +## End-to-end Example + +> **Note** +> If you have never worked with Triton before, you may be interested in first +checking out the [Quickstart](../getting_started/quickstart.md) example. +> Some basic understanding of Triton may be useful for the following section, +but this example is meant to be straightforward enough without prior experience. + +Let's take an ONNX model as our example since ONNX is designed to be a format +that can be [easily +exported](https://github.com/onnx/tutorials#converting-to-onnx-format) from most +other frameworks. + +1. Create a [Model Repository](model_repository.md) and download our example +`densenet_onnx` model into it. + +```bash +# Create model repository with placeholder for model and version 1 +mkdir -p ./models/densenet_onnx/1 + +# Download model and place it in model repository +wget -O models/densenet_onnx/1/model.onnx +https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx +``` + +2. Create a minimal [Model Configuration](model_configuration.md) for the +`densenet_onnx` model in our [Model Repository](model_repository.md) at +`./models/densenet_onnx/config.pbtxt`. 
+ +> **Note** +> This is a slightly simplified version of another [example +config](../examples/model_repository/densenet_onnx/config.pbtxt) that utilizes +other [Model Configuration](model_configuration.md) features not necessary for +this example. + +```protobuf +name: "densenet_onnx" +backend: "onnxruntime" +max_batch_size: 0 +input: [ + { + name: "data_0", + data_type: TYPE_FP32, + dims: [ 1, 3, 224, 224] + } +] +output: [ + { + name: "prob_1", + data_type: TYPE_FP32, + dims: [ 1, 1000, 1, 1 ] + } +] +``` + +> **Note** +> As of the 22.07 release, both Triton and Model Analyzer support fully +auto-completing the config file for +[backends that support it](model_configuration.md#auto-generated-model-configuration). +> So for an ONNX model, for example, this step can be skipped unless you want to +explicitly set certain parameters. + +3. Start the server container + +To serve our model, we will use the server container which comes pre-installed +with a `tritonserver` binary. + +```bash +# Start server container +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.09-py3 + +# Start serving your models +tritonserver --model-repository=/mnt/models +``` + +> **Note** +> The `-v $PWD:/mnt` is mounting your current directory on the host into the +`/mnt` directory inside the container. +> So if you created your model repository in `$PWD/models`, you will find it +inside the container at `/mnt/models`. +> You can change these paths as needed. See +[docker volume](https://docs.docker.com/storage/volumes/) docs for more information on +how this works. + + +To check if the model loaded successfully, we expect to see our model in a +`READY` state in the output of the previous command: + +``` +... +I0802 18:11:47.100537 135 model_repository_manager.cc:1345] successfully loaded 'densenet_onnx' version 1 +... ++---------------+---------+--------+ +| Model | Version | Status | ++---------------+---------+--------+ +| densenet_onnx | 1 | READY | ++---------------+---------+--------+ +... +``` + +4. Verify the model can run inference + +To verify our model can perform inference, we will use the `triton-client` +container that we already started which comes with `perf_analyzer` +pre-installed. + +In a separate shell, we use Perf Analyzer to sanity check that we can run +inference and get a baseline for the kind of performance we expect from this +model. + +In the example below, Perf Analyzer is sending requests to models served on the +same machine (`localhost` from the server container via `--network=host`). +However, you may also test models being served remotely at some `:` +by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u +127.0.0.1:8000`. + +```bash +# Start the SDK container interactively +docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.09-py3-sdk + +# Benchmark model being served from step 3 +perf_analyzer -m densenet_onnx --concurrency-range 1:4 +``` + +``` +... +Inferences/Second vs. Client Average Batch Latency +Concurrency: 1, throughput: 265.147 infer/sec, latency 3769 usec +Concurrency: 2, throughput: 890.793 infer/sec, latency 2243 usec +Concurrency: 3, throughput: 937.036 infer/sec, latency 3199 usec +Concurrency: 4, throughput: 965.21 infer/sec, latency 4142 usec +``` + +5. 
Run Model Analyzer to find the best configurations for our model + +While Model Analyzer comes pre-installed in the SDK (client) container and +supports various modes of connecting to a Triton server, for simplicity we will +use install Model Analyzer in our `server` container to use the `local` +(default) mode. +To learn more about other methods of connecting Model Analyzer to a running +Triton Server, see the `--triton-launch-mode` Model Analyzer flag. + +```bash +# Enter server container interactively +docker exec -ti triton-server bash + +# Stop existing tritonserver process if still running +# because model-analyzer will start its own server +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` +kill ${SERVER_PID} + +# Install model analyzer +pip install --upgrade pip +pip install triton-model-analyzer wkhtmltopdf + +# Profile the model using local (default) mode +# NOTE: This may take some time, in this example it took ~10 minutes +model-analyzer profile \ + --model-repository=/mnt/models \ + --profile-models=densenet_onnx \ + --output-model-repository-path=results + +# Summarize the profiling results +model-analyzer analyze --analysis-models=densenet_onnx +``` + +Example Model Analyzer output summary: + +> In 51 measurements across 6 configurations, `densenet_onnx_config_3` provides +the best throughput: **323 infer/sec**. +> +> **This is a 92% gain over the default configuration (168 infer/sec), under the +given constraints.** + +| Model Config Name | Max Batch Size | Dynamic Batching | Instance Count | p99 Latency (ms) | Throughput (infer/sec) | Max GPU Memory Usage (MB) | Average GPU Utilization (%) | +|---|---|---|---|---|---|---|---| +| densenet_onnx_config_3 | 0 | Enabled | 4/GPU | 35.8 | 323.13 | 3695 | 58.6 | +| densenet_onnx_config_2 | 0 | Enabled | 3/GPU | 59.575 | 295.82 | 3615 | 58.9 | +| densenet_onnx_config_4 | 0 | Enabled | 5/GPU | 69.939 | 291.468 | 3966 | 58.2 | +| densenet_onnx_config_default | 0 | Disabled | 1/GPU | 12.658 | 167.549 | 3116 | 51.3 | + +In the table above, we see that setting our GPU [Instance +Count](model_configuration.md#instance-groups) to 4 allows us to achieve the +highest throughput and almost lowest latency on this system. + +Also, note that this `densenet_onnx` model has a fixed batch-size that is +explicitly specified in the first dimension of the Input/Output `dims`, +therefore the `max_batch_size` parameter is set to 0 as described +[here](model_configuration.md#maximum-batch-size). +For models that support dynamic batch size, Model Analyzer would also tune the +`max_batch_size` parameter. + +> **Warning** +> These results are specific to the system running the Triton server, so for +example, on a smaller GPU we may not see improvement from increasing the GPU +instance count. +> In general, running the same configuration on systems with different hardware +(CPU, GPU, RAM, etc.) may provide different results, so it is important to +profile your model on a system that accurately reflects where you will deploy +your models for your use case. + +6. Extract optimal config from Model Analyzer results + +In our example above, `densenet_onnx_config_3` was the optimal configuration. +So let's extract that `config.pbtxt` and put it back in our model repository for future use. 
+ +```bash +# (optional) Backup our original config.pbtxt (if any) to another directory +cp /mnt/models/densenet_onnx/config.pbtxt /tmp/original_config.pbtxt + +# Copy over the optimal config.pbtxt from Model Analyzer results to our model repository +cp ./results/densenet_onnx_config_3/config.pbtxt /mnt/models/densenet_onnx/ +``` + +Now that we have an optimized Model Configuration, we are ready to take our +model to deployment. For further manual tuning, read the [Model +Configuration](model_configuration.md) and [Optimization](optimization.md) docs +to learn more about Triton's complete set of capabilities. + +In this example, we happened to get both the highest throughput and almost +lowest latency from the same configuration, but in some cases this is a tradeoff +that must be made. Certain models or configurations may achieve a higher +throughput but also incur a higher latency in return. It is worthwhile to fully +inspect the reports generated by Model Analyzer to ensure your model performance +meets your requirements. diff --git a/docs/user_guide/ragged_batching.md b/docs/user_guide/ragged_batching.md new file mode 100644 index 0000000000..308b75fa57 --- /dev/null +++ b/docs/user_guide/ragged_batching.md @@ -0,0 +1,139 @@ + + +# Ragged Batching + +Triton provides [dynamic batching feature](model_configuration.md#dynamic-batcher), +which combines multiple requests for the same model execution to provide larger +throughput. By default, the requests can be dynamically batched only if +each input has the same shape across the requests. In order to exploit dynamic +batching for cases where input shapes often vary, the client would need to pad +the input tensors in the requests to the same shape. + +Ragged batching is a feature to avoid explicit padding by allowing user to +specify which of the inputs doesn't require the shape check. User can specify +such input (ragged input) by setting `allow_ragged_batch` field in the model +config: + +``` +... +input [ + { + name: "input0" + data_type: TYPE_FP32 + dims: [ 16 ] + allow_ragged_batch: true + } +] +... +``` + +How ragged input are processed in a batch of requests depends on the backend +implementation. The backends, such as +[ONNX Runtime backend](https://github.com/triton-inference-server/onnxruntime_backend), +[TensorFlow backend](https://github.com/triton-inference-server/tensorflow_backend), +[PyTorch backend](https://github.com/triton-inference-server/pytorch_backend), +and [TensorRT backend](https://github.com/triton-inference-server/tensorrt_backend), +require models to accept ragged inputs as 1-dimensional tensors. +These backends concatenates the request inputs into the 1-dimensional tensor. + +Because the concatenated input doesn't track the start and end index for each +request, the backends often require the model to have additional input(s), +[batch input](#batch-input), that describe various information about the batch +formed. + +## Batch Input + +Batch input is often used in combination with ragged input to provide +information about each batch element, such as the element count +of an input for each request in the batch. A batch input is generated by +Triton instead of being provided in the request, because the information can +only be finalized after the dynamic batch is formed. + +Besides element count, +there are other batch input kinds that the user can specify, see the +[protobuf documentation](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +for details. 
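+
+To make the accumulated-element-count idea concrete, below is a small
+NumPy sketch, independent of any particular backend, of how a model
+could recover per-request segments from a concatenated ragged input.
+The tensor names and values mirror the worked example in the next
+section and are purely illustrative.
+
+```python
+import numpy as np
+
+# Three requests with 3, 4, and 5 elements concatenated into one ragged
+# INPUT of shape [12]; INDEX holds the accumulated element counts.
+flat_input = np.arange(12, dtype=np.float32)
+index = np.array([3.0, 7.0, 12.0], dtype=np.float32)  # TYPE_FP32 in the example config
+
+# Drop the last accumulated count and use the rest as split points.
+split_points = index[:-1].astype(np.int64)     # [3, 7]
+segments = np.split(flat_input, split_points)  # lengths 3, 4, 5
+
+for i, segment in enumerate(segments):
+    print(f"request {i}: {segment.shape[0]} elements")
+```
+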
+ +## Example on Ragged Input and Batch Input + +If you have a model that accepts 1 variable length input tensor, INPUT, with +shape [ -1, -1 ]. The first dimension is the batch dimension, and the second +dimension is the variable-length content. When the client sends 3 requests of +shapes [ 1, 3 ], [ 1, 4 ], [ 1, 5 ]. To exploit dynamic batching, the +straight-forward way to implement this model would expect INPUT shape [ -1, -1 ] +and assume that all inputs were padded to same length so that all requests +become shape [ 1, 5 ] and thus Triton can batch and send them to the model +as a single [ 3, 5 ] tensor. In this case, there will be overhead on padding +the tensor and on extra model computation on the padded content. +Below is the input config: + +``` +max_batch_size: 16 +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +``` + +With triton ragged batching, the model will be implemented to expect INPUT shape +[ -1 ] and an additional batch input, INDEX, shape [ -1 ] which the model should +use to interpret the batch elements in INPUT. For such model, +the client requests don't need to be padded and they can be sent as they are +(with shapes [ 1, 3 ], [ 1, 4 ], [ 1, 5 ]). The backends discussed above will +batch the input into a tensor of shape [ 12 ] which contains the 3 + 4 + 5 +concatenation of the requests. Triton also creates the batch input tensor of +shape [ 3 ] with value [ 3, 7, 12 ] which gives the offset into the input tensor +where each batch element ends. Below is the input config: + +``` +max_batch_size: 16 +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ -1 ] + allow_ragged_batch: true + } +] +batch_input [ + { + kind: BATCH_ACCUMULATED_ELEMENT_COUNT + target_name: "INDEX" + data_type: TYPE_FP32 + source_input: "INPUT" + } +] +``` + +The above example uses +[`BATCH_ACCUMULATED_ELEMENT_COUNT`](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) +type of ragged batching. Other types described in [protobuf documentation](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) operate similarly. \ No newline at end of file diff --git a/docs/user_guide/rate_limiter.md b/docs/user_guide/rate_limiter.md new file mode 100644 index 0000000000..69b94fd8b8 --- /dev/null +++ b/docs/user_guide/rate_limiter.md @@ -0,0 +1,131 @@ + + +# Rate Limiter + +Rate limiter manages the rate at which requests are scheduled on +model instances by Triton. The rate limiter operates across all +models loaded in Triton to allow *cross-model prioritization*. + +In absence of rate limiting (--rate-limit=off), Triton schedules +execution of a request (or set of requests when using dynamic +batching) as soon as a model instance is available. This behavior +is typically best suited for performance. However, there can be +cases where running all the models simultaneously places excessive +load on the server. For instance, model execution on some +frameworks dynamically allocate memory. Running all such models +simultaneously may lead to system going out-of-memory. + +Rate limiter allows to postpone the inference execution on some +model instances such that not all of them runs simultaneously. +The model priorities are used to decide which model instance +to schedule next. + +## Using Rate Limiter + +To enable rate limiting users must set `--rate-limit` option when +launching tritonserver. For more information, consult usage of +the option emitted by `tritonserver --help`. 
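+
+For example, a typical invocation might look like the following sketch,
+where the model repository path is a placeholder; at the time of writing,
+`execution_count` is the mode that enables rate limiting, but confirm the
+exact option values with `tritonserver --help` for your version.
+
+```
+$ tritonserver --model-repository=/models --rate-limit=execution_count
+```
+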
+ +The rate limiter is controlled by the rate limiter configuration given +for each model instance, as described in [rate limiter +configuration](model_configuration.md#rate-limiter-configuration). +The rate limiter configuration includes +[resources](model_configuration.md#resources) and +[priority](model_configuration.md#priority) for the model instances +defined by the instance group. + +### Resources + +Resources are identified by a unique name and a count indicating +the number of copies of the resource. By default, model instance +uses no rate-limiter resources. By listing a resource/count the +model instance indicates that it requires that many resources to +be available on the model instance device before it can be allowed +to execute. When under execution the specified many resources are +allocated to the model instance only to be released when the +execution is over. The available number of resource copies +are, by default, the max across all model instances that list that +resource. For example, assume three loaded model instances A, B +and C each specifying the following resource requirements for +a single device: + +``` +A: [R1: 4, R2: 4] +B: [R2: 5, R3: 10, R4: 5] +C: [R1: 1, R3: 7, R4: 2] +``` + +By default, based on those model instance requirements, the server +will create the following resources with the indicated copies: + +``` +R1: 4 +R2: 5 +R3: 10 +R4: 5 +``` + +These values ensure that all model instances can be successfully +scheduled. The default for a resource can be overridden by giving +it explicitly on command-line using `--rate-limit-resource` option. +`tritonserver --help` will provide with more detailed usage +instructions. + +By default, the available resource copies are per-device and resource +requirements for a model instance are enforced against corresponding +resources associated with the device where the model instance runs. +The `--rate-limit-resource` allows users to provide different resource +copies to different devices. Rate limiter can also handle global +resources. Instead of creating resource copies per-device, a global +resource will have a single copy all across the system. + +Rate limiter depends upon the model configuration to determine +whether the resource is global or not. See +[resources](model_configuration.md#resources) for more details on +how to specify them in model configuration. + +For tritonserver, running on a two device machine, invoked with +`--rate-limit-resource=R1:10 --rate-limit-resource=R2:5:0 --rate-limit-resource=R2:8:1 --rate-limit-resource=R3:2` +, available resource copies are: + +``` +GLOBAL => [R3: 2] +DEVICE 0 => [R1: 10, R2: 5] +DEVICE 1 => [R1: 10, R2: 8] +``` + +where R3 appears as a global resource in one of the loaded model. + +### Priority + +In a resource constrained system, there will be a contention for +the resources among model instances to execute their inference +requests. Priority setting helps determining which model instance +to select for next execution. See [priority](model_configuration.md#priority) +for more information. diff --git a/docs/user_guide/request_cancellation.md b/docs/user_guide/request_cancellation.md new file mode 100644 index 0000000000..8db4e3b8c1 --- /dev/null +++ b/docs/user_guide/request_cancellation.md @@ -0,0 +1,102 @@ + + +# Request Cancellation + +Starting from r23.10, Triton supports handling request cancellation received +from the gRPC client or a C API user. 
Long running inference requests such +as for auto generative large language models may run for an indeterminate +amount of time or indeterminate number of steps. Additionally clients may +enqueue a large number of requests as part of a sequence or request stream +and later determine the results are no longer needed. Continuing to process +requests whose results are no longer required can significantly impact server +resources. + +## Issuing Request Cancellation + +### In-Process C API + +[In-Process Triton Server C API](../customization_guide/inference_protocols.md#in-process-triton-server-api) has been enhanced with `TRITONSERVER_InferenceRequestCancel` +and `TRITONSERVER_InferenceRequestIsCancelled` to issue cancellation and query +whether cancellation has been issued on an inflight request respectively. Read more +about the APIs in [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). + + +### gRPC Endpoint + +In addition, [gRPC endpoint](../customization_guide/inference_protocols.md#httprest-and-grpc-protocols) can +now detect cancellation from the client and attempt to terminate request. +At present, only gRPC python client supports issuing request cancellation +to the server endpoint. See [request-cancellation](https://github.com/triton-inference-server/client#request-cancellation) +for more details on how to issue requests from the client-side. +See gRPC guide on RPC [cancellation](https://grpc.io/docs/guides/cancellation/) for +finer details. + +## Handling in Triton Core + +Triton core checks for requests that have been cancelled at some critical points +when using [dynamic](./model_configuration.md#dynamic-batcher) or +[sequence](./model_configuration.md#sequence-batcher) batching. The checking is +also performed between each +[ensemble](./model_configuration.md#ensemble-scheduler) steps and terminates +further processing if the request is cancelled. + +On detecting a cancelled request, Triton core responds with CANCELLED status. If a request +is cancelled when using [sequence_batching](./model_configuration.md#sequence-batcher), +then all the pending requests in the same sequence will also be cancelled. The sequence +is represented by the requests that has identical sequence id. + +**Note**: Currently, Triton core does not detect cancellation status of a request once +it is forwarded to [rate limiter](./rate_limiter.md). Improving the request cancellation +detection and handling within Triton core is work in progress. + +## Handling in Backend + +Upon receiving request cancellation, Triton does its best to terminate request +at various points. However, once a request has been given to the backend +for execution, it is up to the individual backends to detect and handle +request termination. +Currently, the following backends support early termination: +- [TensorRT-LLM backend](https://github.com/triton-inference-server/tensorrtllm_backend) +- [vLLM backend](https://github.com/triton-inference-server/vllm_backend) +- [python backend](https://github.com/triton-inference-server/python_backend) + +Python backend is a special case where we expose the APIs to detect cancellation +status of the request but it is up to the `model.py` developer to detect whether +the request is cancelled and terminate further execution. + +**For the backend developer**: The backend APIs have also been enhanced to let the +backend detect whether the request received from Triton core has been cancelled. 
+See `TRITONBACKEND_RequestIsCancelled` and `TRITONBACKEND_ResponseFactoryIsCancelled` +in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) +for more details. The backend upon detecting request cancellation can stop processing +it any further. +The Python models running behind Python backend can also query the cancellation status +of request and response_sender. See [this](https://github.com/triton-inference-server/python_backend#request-cancellation-handling) +section in python backend documentation for more details. + diff --git a/docs/user_guide/response_cache.md b/docs/user_guide/response_cache.md new file mode 100644 index 0000000000..14106db6de --- /dev/null +++ b/docs/user_guide/response_cache.md @@ -0,0 +1,268 @@ + + +# Triton Response Cache + +## Overview + +In this document an *inference request* is the model name, model version, and +input tensors (name, shape, datatype and tensor data) that make up a request +submitted to Triton. An inference result is the output tensors (name, shape, +datatype and tensor data) produced by an inference execution. The response cache +is used by Triton to hold inference results generated for previous executed +inference requests. Triton will maintain the response cache so that inference +requests that hit in the cache will not need to execute a model to produce +results and will instead extract their results from the cache. For some use +cases this can significantly reduce the inference request latency. + +Triton accesses the response cache with a hash of the inference request that +includes the model name, model version and model inputs. If the hash is found in +the cache, the corresponding inference result is extracted from the cache and +used for the request. When this happens there is no need for Triton to execute +the model to produce the inference result. If the hash is not found in the +cache, Triton executes the model to produce the inference result, and then +records that result in the cache so that subsequent inference requests can +(re)use those results. + +## Usage + +In order for caching to be used on a given model, it must be enabled +on both the server-side, and in the model's +[model config](model_configuration.md#response-cache). See the following +sections below for more details. + +### Enable Caching on Server-side + +The response cache is enabled on the server-side by specifying a cache +implementation name `` and corresponding configuration when starting +the Triton server. + +Through the CLI, this translates to setting +`tritonserver --cache-config ,= ...`. For example: +``` +tritonserver --cache-config local,size=1048576 +``` + +> [!NOTE] +> If using a non-interactive shell, you may need to specify the argument without +> the space like so: `--cache-config=,=`. + +For in-process C API applications, this translates to calling +`TRITONSERVER_SetCacheConfig(const char* cache_implementation, const char* config_json)`. + +This allows users to enable/disable caching globally on server startup. + +### Enable Caching for a Model + +**By default, no model uses response caching even if the response cache +is enabled globally with the `--cache-config` flag.** + +For a given model to use response caching, the model must also have +response caching enabled in its model configuration: +``` +# config.pbtxt + +response_cache { + enable: true +} +``` + +This allows users to enable/disable caching for specific models. 
+ +For more information on enabling the response cache for each model, see the +[model configuration docs](model_configuration.md#response-cache). + +### Cache Implementations + +Starting in the 23.03 release, Triton has a set of +[TRITONCACHE APIs](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritoncache.h) +that are used to communicate with a cache implementation of the user's choice. + +A cache implementation is a shared library that implements the required +TRITONCACHE APIs and is dynamically loaded on server startup, if enabled. + +Triton's most recent +[tritonserver release containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) +come with the following cache implementations out of the box: +- [local](https://github.com/triton-inference-server/local_cache): `/opt/tritonserver/caches/local/libtritoncache_local.so` +- [redis](https://github.com/triton-inference-server/redis_cache): `/opt/tritonserver/caches/redis/libtritoncache_redis.so` + +With these TRITONCACHE APIs, `tritonserver` exposes a new `--cache-config` +CLI flag that gives the user flexible customization of which cache implementation +to use, and how to configure it. Similar to the `--backend-config` flag, +the expected format is `--cache-config ,=` and may +be specified multiple times to specify multiple keys if the cache implementation +requires it. + +#### Local Cache + +The `local` cache implementation is equivalent to the response cache used +internally before the 23.03 release. For more implementation specific details, +see the +[local cache implementation](https://github.com/triton-inference-server/local_cache). + +When `--cache-config local,size=SIZE` is specified with a non-zero `SIZE`, +Triton allocates the requested size in CPU memory and **shares the +cache across all inference requests and across all models**. + +#### Redis Cache + +The `redis` cache implementation exposes the ability for Triton to communicate +with a Redis server for caching. The `redis_cache` implementation is essentially +a Redis client that acts as an intermediary between Triton and Redis. + +To list a few benefits of the `redis` cache compared to the `local` cache in +the context of Triton: +- The Redis server can be hosted remotely as long as it is accessible by Triton, + so it is not tied directly to the Triton process lifetime. + - This means Triton can be restarted and still have access to previously cached entries. + - This also means that Triton doesn't have to compete with the cache for memory/resource usage. +- Multiple Triton instances can share a cache by configuring each Triton instance + to communicate with the same Redis server. +- The Redis server can be updated/restarted independently of Triton, and + Triton will fallback to operating as it would with no cache access during + any Redis server downtime, and log appropriate errors. + +In general, the Redis server can be configured/deployed as needed for your use +case, and Triton's `redis` cache will simply act as a client of your Redis +deployment. The [Redis docs](https://redis.io/docs/) should be consulted for +questions and details about configuring the Redis server. + +For Triton-specific `redis` cache implementation details/configuration, see the +[redis cache implementation](https://github.com/triton-inference-server/redis_cache). + +#### Custom Cache + +With the TRITONCACHE API interface, it is now possible for +users to implement their own cache to suit any use-case specific needs. 
+To see the required interface that must be implemented by a cache +developer, see the +[TRITONCACHE API header](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritoncache.h). +The `local` or `redis` cache implementations may be used as reference. + +Upon successfully developing and building a custom cache, the resulting shared +library (ex: `libtritoncache_.so`) must be placed in the cache directory +similar to where the `local` and `redis` cache implementations live. By default, +this directory is `/opt/tritonserver/caches`, but a custom directory may be +specified with `--cache-dir` as needed. + +To put this example together, if the custom cache were named "custom" +(this name is arbitrary), by default Triton would expect to find the +cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.so`. + +## Deprecation Notes + +> **Note** +> Prior to 23.03, enabling the `local` cache used to be done through setting a non-zero size +> (in bytes) when Triton was launched using the `--response-cache-byte-size` flag. +> +> Starting in 23.03, the `--response-cache-byte-size` flag is now deprecated and +> `--cache-config` should be used instead. For backwards compatibility, +> `--response-cache-byte-size` will continue to function under the hood by being +> converted to the corresponding `--cache-config` argument, but it will default +> to using the `local` cache implementation. It is not possible to choose other +> cache implementations using the `--response-cache-byte-size` flag. +> +> For example, `--response-cache-byte-size 1048576` +> would be equivalent to `--cache-config local,size=1048576`. However, the +> `--cache-config` flag is much more flexible and should be used instead. + +> **Warning** +> +> The `local` cache implementation may fail to initialize for very small values +> of `--cache-config local,size=` or `--response-cache-byte-size` +> (ex: less than 1024 bytes) due to internal memory management requirements. +> If you encounter an initialization error for a relatively small cache size, +> try increasing it. +> +> Similarly, the size is upper bounded by the available RAM on the system. +> If you encounter an initial allocation error for a very large cache size +> setting, try decreasing it. + +## Performance + +The response cache is intended to be used for use cases where a significant +number of duplicate requests (cache hits) are expected and therefore would +benefit from caching. The term "significant" here is subjective to the use +case, but a simple interpretation would be to consider the proportion of +expected cache hits/misses, as well as the average time spend computing +a response. + +For cases where cache hits are common and computation is expensive, +the cache can significantly improve overall performance. + +For cases where most requests are unique (cache misses) or the compute is +fast/cheap (the model is not compute-bound), the cache can negatively impact +the overall performance due to the overhead of managing and communicating with +the cache. + +## Ensemble Model Caching + +Top-level requests to ensemble models support caching if all composing models +within the ensemble support caching as well. + +Similarly, if a composing model in the ensemble doesn't support caching, +then the ensemble model would inherit this limitation and not support +caching either. See the known limitations below for what types of models +support caching. 
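+
+As an illustration, caching is enabled on an ensemble with the same
+`response_cache` stanza used for any other model. The abridged sketch
+below uses a hypothetical ensemble name and omits the usual inputs,
+outputs, and `ensemble_scheduling` sections:
+
+```
+# config.pbtxt of the ensemble (abridged sketch)
+name: "my_ensemble"
+platform: "ensemble"
+response_cache {
+  enable: true
+}
+```
+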
+
+A cache hit on an ensemble will skip sending requests to the composing models
+entirely, and return the cached response from the ensemble model.
+
+A cache miss on an ensemble will fall back to standard inference and the request
+will proceed to the composing models as usual.
+
+The ensemble and its composing models can independently enable caching, and
+each maintains its own cache when enabled. It is possible for a request
+to be a cache miss at the ensemble level, but then for an intermediate model
+within the ensemble to have a cache hit, depending on the inputs and outputs
+of the models being composed. Composing models do not need to enable caching to
+enable it at the ensemble level.
+
+
+## Known Limitations
+
+- Only input tensors located in CPU memory will be hashable for accessing the
+  cache. If an inference request contains input tensors not in CPU memory, the
+  request will not be hashed and therefore the response will not be cached.
+- Only responses with all output tensors located in CPU memory will be eligible
+  for caching. If any output tensor in a response is not located in CPU memory,
+  the response will not be cached.
+- The cache is accessed using only the inference request hash. As a result, if
+  two different inference requests generate the same hash (a hash collision),
+  then Triton may incorrectly use the cached result for an inference request.
+  The hash is a 64-bit value so the likelihood of collision is small.
+- Only successful inference requests will have their responses cached. If a
+  request fails or returns an error during inference, its response will not be
+  cached.
+- Only requests going through the Default Scheduler or Dynamic Batch Scheduler
+  are eligible for caching. The Sequence Batcher does not currently support
+  response caching.
+- The response cache does not currently support
+  [decoupled models](decoupled_models.md).
+
diff --git a/docs/user_guide/trace.md b/docs/user_guide/trace.md
new file mode 100644
index 0000000000..8f7708665b
--- /dev/null
+++ b/docs/user_guide/trace.md
@@ -0,0 +1,654 @@
+
+
+# Triton Server Trace
+
+Triton includes the capability to generate a detailed trace for
+individual inference requests. Tracing is enabled by command-line
+arguments when running the tritonserver executable.
+
+The `--trace-config` command line option in Triton can be used to specify
+global and trace-mode-specific config settings. The format of this flag
+is `--trace-config <mode>,<setting>=<value>`, where `<mode>`
+is either `triton` or `opentelemetry`. By default, the trace mode is set to `triton`,
+and the server will use Triton's trace APIs. For `opentelemetry` mode,
+the server will use the [OpenTelemetry APIs](#opentelemetry-trace-support) to generate,
+collect and export traces for individual inference requests.
+
+To specify global trace settings (level, rate, count, or mode),
+the format is `--trace-config <setting>=<value>`.
+
+An example usage, which invokes Triton's trace APIs:
+
+```
+$ tritonserver \
+    --trace-config triton,file=/tmp/trace.json \
+    --trace-config triton,log-frequency=50 \
+    --trace-config rate=100 \
+    --trace-config level=TIMESTAMPS \
+    --trace-config count=100 ...
+```
+
+## Trace Settings
+### Global Settings
+The following table shows the available global trace settings to pass to `--trace-config`.
+
+| Setting | Default Value | Description |
+| ------- | ------------- | ----------- |
+| `rate` | 1000 | Specifies the sampling rate. The same as deprecated `--trace-rate`. For example, a value of 1000 specifies that every 1000-th inference request will be traced. |
+| `level` | OFF | Indicates the level of trace detail that should be collected and may be specified multiple times to trace multiple information. The same as deprecated `--trace-level`. Choices are `TIMESTAMPS` and `TENSORS`. Note that `opentelemetry` mode does not currently support the `TENSORS` level. |
+| `count` | -1 | Specifies the remaining number of traces to be collected. The default value of -1 specifies to never stop collecting traces. With a value of 100, Triton will stop tracing requests after 100 traces are collected. The same as deprecated `--trace-count`. |
+| `mode` | triton | Specifies which trace APIs to use for collecting traces. The choices are `triton` or `opentelemetry`. |
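+
+Since `level` may be specified multiple times, timestamp and tensor tracing can
+be enabled together. A minimal sketch combining these global settings (the flag
+values below are illustrative, not recommendations):
+
+```
+$ tritonserver \
+    --trace-config triton,file=/tmp/trace.json \
+    --trace-config rate=1000 \
+    --trace-config level=TIMESTAMPS \
+    --trace-config level=TENSORS \
+    --trace-config count=-1 ...
+```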
+
+### Triton Trace APIs Settings
+
+The following table shows the available Triton trace APIs settings for
+`--trace-config triton,<setting>=<value>`.
+
+| Setting | Default Value | Description |
+| ------- | ------------- | ----------- |
+| `file` | empty string | Indicates where the trace output should be written. The same as deprecated `--trace-file`. |
+| `log-frequency` | 0 | Specifies the rate that the traces are written to file. For example, a value of 50 specifies that Triton will log to file for every 50 traces collected. The same as deprecated `--trace-log-frequency`. |
+ +In addition to the trace configuration settings in the command line, you can +modify the trace configuration using the [trace +protocol](../protocol/extension_trace.md). This option is currently not supported, +when trace mode is set to `opentelemetry`. + +**Note**: the following flags are **deprecated**: + +The `--trace-file` option indicates where the trace output should be +written. The `--trace-rate` option specifies the sampling rate. In +this example every 100-th inference request will be traced. The +`--trace-level` option indicates the level of trace detail that should +be collected. `--trace-level` option may be specified multiple times to +trace multiple information. The `--trace-log-frequency` option specifies the +rate that the traces are written to file. In this example Triton will log to +file for every 50 traces collected. The `--trace-count` option specifies the +remaining number of traces to be collected. In this example Triton will stop +tracing more requests after 100 traces are collected. Use the `--help` option +to get more information. + +## Supported Trace Level Option + +- `TIMESTAMPS`: Tracing execution timestamps of each request. +- `TENSORS`: Tracing input and output tensors during the execution. + +## JSON Trace Output + +The trace output is a JSON file with the following schema. + +``` +[ + { + "model_name": $string, + "model_version": $number, + "id": $number, + "request_id": $string, + "parent_id": $number + }, + { + "id": $number, + "timestamps": [ + { "name" : $string, "ns" : $number } + ] + }, + { + "id": $number + "activity": $string, + "tensor":{ + "name": $string, + "data": $string, + "shape": $string, + "dtype": $string + } + }, + ... +] +``` + +Each trace is assigned a "id", which indicates the model name and +version of the inference request. If the trace is from a +model run as part of an ensemble, the "parent_id" will indicate the +"id" of the containing ensemble. +For example: +``` +[ + { + "id": 1, + "model_name": "simple", + "model_version": 1 + }, + ... +] +``` + +Each `TIMESTAMPS` trace will have one or more "timestamps" with +each timestamp having a name and the timestamp in nanoseconds ("ns"). +For example: + +``` +[ + {"id": 1, "timestamps": [{ "name": "HTTP_RECV_START", "ns": 2356425054587444 }] }, + {"id": 1, "timestamps": [{ "name": "HTTP_RECV_END", "ns": 2356425054632308 }] }, + {"id": 1, "timestamps": [{ "name": "REQUEST_START", "ns": 2356425054785863 }] }, + {"id": 1, "timestamps": [{ "name": "QUEUE_START", "ns": 2356425054791517 }] }, + {"id": 1, "timestamps": [{ "name": "INFER_RESPONSE_COMPLETE", "ns": 2356425057587919 }] }, + {"id": 1, "timestamps": [{ "name": "COMPUTE_START", "ns": 2356425054887198 }] }, + {"id": 1, "timestamps": [{ "name": "COMPUTE_INPUT_END", "ns": 2356425057152908 }] }, + {"id": 1, "timestamps": [{ "name": "COMPUTE_OUTPUT_START", "ns": 2356425057497763 }] }, + {"id": 1, "timestamps": [{ "name": "COMPUTE_END", "ns": 2356425057540989 }] }, + {"id": 1, "timestamps": [{ "name": "REQUEST_END", "ns": 2356425057643164 }] }, + {"id": 1, "timestamps": [{ "name": "HTTP_SEND_START", "ns": 2356425057681578 }] }, + {"id": 1, "timestamps": [{ "name": "HTTP_SEND_END", "ns": 2356425057712991 }] } +] +``` + +Each `TENSORS` trace will contain an "activity" and a "tensor". +"activity" indicates the type of tensor, including "TENSOR_QUEUE_INPUT" +and "TENSOR_BACKEND_OUTPUT" by now. "tensor" has the detail of tensor, +including its "name", "data" and "dtype". 
For example: + +``` +[ + { + "id": 1, + "activity": "TENSOR_QUEUE_INPUT", + "tensor":{ + "name": "input", + "data": "0.1,0.1,0.1,...", + "shape": "1,16", + "dtype": "FP32" + } + } +] +``` + +## Trace Summary Tool + +An example [trace summary tool](https://github.com/triton-inference-server/server/blob/main/qa/common/trace_summary.py) can be +used to summarize a set of traces collected from Triton. Basic usage +is: + +``` +$ trace_summary.py +``` + +This produces a summary report for all traces in the file. HTTP and +GRPC inference requests are reported separately. + +``` +File: trace.json +Summary for simple (-1): trace count = 1 +HTTP infer request (avg): 403.578us + Receive (avg): 20.555us + Send (avg): 4.52us + Overhead (avg): 24.592us + Handler (avg): 353.911us + Overhead (avg): 23.675us + Queue (avg): 18.019us + Compute (avg): 312.217us + Input (avg): 24.151us + Infer (avg): 244.186us + Output (avg): 43.88us +Summary for simple (-1): trace count = 1 +GRPC infer request (avg): 383.601us + Send (avg): 62.816us + Handler (avg): 392.924us + Overhead (avg): 51.968us + Queue (avg): 21.45us + Compute (avg): 319.506us + Input (avg): 27.76us + Infer (avg): 227.844us + Output (avg): 63.902us +``` + +Note: The "Receive (avg)" metric is not included in the gRPC summary as gRPC library does not provide any non-intrusive hooks to detect time spent in reading a message from the wire. Tracing an HTTP request will provide an accurate measurement of time spent reading a request from the network. + +Use the -t option to get a summary for each trace in the file. This +summary shows the time, in microseconds, between different points in +the processing of an inference request. For example, the below output +shows that it took 15us from the start of handling the request until +the request was enqueued in the scheduling queue. + +``` +$ trace_summary.py -t +... +simple (-1): + request handler start + 15us + queue start + 20us + compute start + 266us + compute end + 4us + request handler end + 19us + grpc send start + 77us + grpc send end +... +``` + +The script can also show the data flow of the first request if there are +`TENSORS` traces in the file. If the `TENSORS` traces are from an ensemble, +the data flow will be shown with the dependency of each model. + +``` +... +Data Flow: + ========================================================== + Name: ensemble + Version:1 + QUEUE_INPUT: + input: [[0.705676 0.830855 0.833153]] + BACKEND_OUTPUT: + output: [[1. 2. 7. 0. 4. 7. 9. 3. 4. 9.]] + ========================================================== + ================================================== + Name: test_trt1 + Version:1 + QUEUE_INPUT: + input: [[0.705676 0.830855 0.833153]] + BACKEND_OUTPUT: + output1: [[1. 1. ...]] + ================================================== + ================================================== + Name: test_trt2 + Version:1 + QUEUE_INPUT: + input: [[0.705676 0.830855 0.833153]] + BACKEND_OUTPUT: + output2: [[2. 2. ...]] + ================================================== + ================================================== + Name: test_py + Version:1 + QUEUE_INPUT: + output1: [[1. 1. ...]] + QUEUE_INPUT: + output2: [[2. 2. ...]] + BACKEND_OUTPUT: + output: [[1. 2. 7. 0. 4. 7. 9. 3. 4. 9.]] + ================================================== +... +``` + +The meaning of the trace timestamps is: + +* HTTP Request Receive: Collected only for inference requests that use the + HTTP protocol. The time required to read the inference request from + the network. 
+ +* Send: The time required to send the inference response. + +* Overhead: Additional time required in the HTTP endpoint to + process the inference request and response. + +* Handler: The total time spent handling the inference request, not + including the HTTP and GRPC request/response handling. + + * Queue: The time the inference request spent in the scheduling queue. + + * Compute: The time the inference request spent executing the actual + inference. This time includes the time spent copying input and + output tensors. If --trace-level=TIMESTAMPS then a breakdown of the + compute time will be provided as follows: + + * Input: The time to copy input tensor data as required by the + inference framework / backend. This includes the time to copy + input tensor data to the GPU. + + * Infer: The time spent executing the model to perform the + inference. + + * Output: The time to copy output tensor data as required by the + inference framework / backend. This includes the time to copy + output tensor data from the GPU. + + * Overhead: Additional time required for request handling not + covered by Queue or Compute times. + +* Data Flow: The data flow of the first request. It contains the input and + output tensors of each part of execution. + + * Name: The name of model. + + * Version: The version of model. + + * QUEUE_INPUT: The tensor entering the queue of a backend to wait for + scheduling. + + * BACKEND_OUTPUT: The tensor in the response of a backend. + +## Tracing for BLS models + +Triton does not collect traces for child models invoked from +[BLS](https://github.com/triton-inference-server/python_backend/tree/main#business-logic-scripting) +models by default. + +To include child models into collected traces, user needs to provide the `trace` +argument (as shown in the example below), when constructing an InferenceRequest object. +This helps Triton associate the child model with the parent model's trace (`request.trace()`). + +```python + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + ... + def execute(self, requests): + ... + for request in requests: + ... + inference_request = pb_utils.InferenceRequest( + model_name='model_name', + requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'], + inputs=[], trace = request.trace()) + +``` + +## OpenTelemetry trace support + +Triton provides an option to generate and export traces using +[OpenTelemetry APIs and SDKs](https://opentelemetry.io/). + +To specify OpenTelemetry mode for tracing, specify the `--trace-config` +flag as follows: + +``` +$ tritonserver --trace-config mode=opentelemetry \ + --trace-config opentelemetry,url= ... +``` + +Triton's OpenTelemetry trace mode uses +[Batch Span Processor](https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#batch-span-processor), +which batches ended spans and sends them in bulk. Batching helps +with data compression and reduces the number of outgoing connections +required to transmit the data. This processor supports both size and +time based batching. Size-based batching is controlled by 2 parameters: +`bsp_max_export_batch_size` and `bsp_max_queue_size`, while time-based batching +is controlled by `bsp_schedule_delay`. Collected spans will be exported when +the batch size reaches `bsp_max_export_batch_size`, or delay since last export +reaches `bsp_schedule_delay`, whatever comes first. 
Additionally, users should
+make sure that `bsp_max_export_batch_size` is always less than
+`bsp_max_queue_size`; otherwise the excess spans will be dropped
+and trace data will be lost.
+
+Default parameters for the Batch Span Processor are provided in
+[`OpenTelemetry trace APIs settings`](#opentelemetry-trace-apis-settings).
+As a general recommendation, make sure that `bsp_max_queue_size` is large enough
+to hold all collected spans, and that `bsp_schedule_delay` does not cause frequent
+exports, which will affect Triton Server's latency. A minimal Triton trace
+consists of 3 spans: a top-level span, a model span, and a compute span.
+
+* __Top level span__: The top-level span collects timestamps for when the
+request was received by Triton, and when the response was sent. Any Triton
+trace contains only 1 top-level span.
+* __Model span__: Model spans collect information on when the request for
+this model was started, when it was placed in a queue, and when it ended.
+A minimal Triton trace contains 1 model span.
+* __Compute span__: Compute spans record compute timestamps. A minimal
+Triton trace contains 1 compute span.
+
+The total number of spans depends on the complexity of your model.
+A general rule is that any base model - a single model that performs computations -
+produces 1 model span and 1 compute span. For ensembles, every composing
+model produces model and compute spans in addition to one model span for the
+ensemble. [BLS](#tracing-for-bls-models) models produce the same number of
+model and compute spans as the total number of models involved in the BLS request,
+including the main BLS model.
+
+
+### Differences in trace contents from Triton's trace [output](#json-trace-output)
+
+OpenTelemetry APIs produce [spans](https://opentelemetry.io/docs/concepts/observability-primer/#spans)
+that collect the same timestamps as Triton's Trace
+APIs. Each span also includes `model_name`, `model_version`, `request_id`,
+and `parent_id` as an [attribute](https://opentelemetry.io/docs/concepts/observability-primer/#span-attributes).
+
+The span collects `TIMESTAMPS` that consist of a name and a timestamp
+in nanoseconds, which is similar to Triton Trace APIs. However,
+OpenTelemetry relies on the system's clock for event timestamps, which is based
+on the system's real-time clock. On the other hand, Triton Trace APIs
+report timestamps using a steady clock, which is a monotonic clock that ensures
+time always moves forward. This clock is not related to wall clock time
+and, for example, can measure time since the last reboot.
+
+
+### OpenTelemetry trace APIs settings
+
+The following table shows the available OpenTelemetry trace APIs settings for
+`--trace-config opentelemetry,<setting>=<value>`.
+
+| Setting | Default Value | Description |
+| ------- | ------------- | ----------- |
+| `url` | `http://localhost:4318/v1/traces` | `host:port` to which the receiver is going to receive trace data. |
+| `resource` | `service.name=triton-inference-server` | Key-value pairs to be used as resource attributes. Should be specified following the provided template: `--trace-config opentelemetry,resource=<key>=<value>`. For example: `--trace-config opentelemetry,resource=service.name=triton` and `--trace-config opentelemetry,resource=service.version=1`. Alternatively, key-value attributes can be specified through the `OTEL_RESOURCE_ATTRIBUTES` environment variable. |
+| **Batch Span Processor** | | |
+| `bsp_max_queue_size` | 2048 | Maximum queue size. This setting can also be specified through the `OTEL_BSP_MAX_QUEUE_SIZE` environment variable. |
+| `bsp_schedule_delay` | 5000 | Delay interval (in milliseconds) between two consecutive exports. This setting can also be specified through the `OTEL_BSP_SCHEDULE_DELAY` environment variable. |
+| `bsp_max_export_batch_size` | 512 | Maximum batch size. Must be less than or equal to `bsp_max_queue_size`. This setting can also be specified through the `OTEL_BSP_MAX_EXPORT_BATCH_SIZE` environment variable. |
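+
+As a sketch of how these settings fit together, the following launch command
+(the endpoint and values are illustrative, not recommendations) exports traces
+to a collector listening on the default OTLP/HTTP port and overrides the Batch
+Span Processor defaults listed above:
+
+```
+$ tritonserver \
+    --trace-config mode=opentelemetry \
+    --trace-config opentelemetry,url=http://localhost:4318/v1/traces \
+    --trace-config opentelemetry,resource=service.name=triton \
+    --trace-config opentelemetry,bsp_max_queue_size=4096 \
+    --trace-config opentelemetry,bsp_max_export_batch_size=1024 \
+    --trace-config opentelemetry,bsp_schedule_delay=2000 ...
+```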
+ +### OpenTelemetry Context Propagation + +Triton supports [context propagation](https://opentelemetry.io/docs/concepts/context-propagation/) +in OpenTelemetry mode starting in version 24.01. Note, that every request +with propagated OpenTelemetry context will be traced, regardless of `rate` and +`count` trace settings. If a user wishes to trace only those requests, for which +OpenTelemetry context was injected on the client side, please start Triton with +`--trace-config rate=0`: +``` +$ tritonserver \ + --trace-config rate=0 \ + --trace-config level=TIMESTAMPS \ + --trace-config count=-1 \ + --trace-config mode=opentelemetry +``` +Please, be aware that this option is subject to change in future releases. + +#### How to inject OpenTelemetry context on the client side + +For C++ clients, please refer to [gRPC](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/examples/grpc/README.md) +and [HTTP](https://github.com/open-telemetry/opentelemetry-cpp/blob/main/examples/http/README.md) +examples. + +For python clients, please make sure to install +[OpenTelemetry Python](https://github.com/open-telemetry/opentelemetry-python/tree/main?tab=readme-ov-file#install). +You can then use the `opentelemetry.propagate.inject` method to prepare headers to +pass with the request, as shown [here](https://github.com/open-telemetry/opentelemetry-python/blob/main/docs/examples/auto-instrumentation/client.py#L37-L41). +Then, you can specify headers in the `infer` method. For references, please +look at our [tests](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py), +e.g. [http context propagation test](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py#L494-L508). + +### Custom Backend Tracing + +In the case when a custom activity needs to be traced in the backend, please +use `TRITONSERVER_InferenceTraceReportActivity` API. For examples, please +refer to the [identity backend](https://github.com/triton-inference-server/identity_backend/blob/main/src/identity.cc). + +In `openTelemetry` trace mode, if one wishes to start a new span, make sure +that the name of your custom activity ends with `_START`. To end the new span, +make sure that corresponding activity ends with `_END`. For example, in the +identity backend, we start a `CUSTOM_ACTIVITY` span, by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L872-L876) +`CUSTOM_ACTIVITY_START` event; and we close this span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L880-L883) +`CUSTOM_ACTIVITY_END` event. + +Please note, that it is user's responsibility to make sure that all custom started +spans are properly ended. + +### Limitations + +- OpenTelemetry trace mode is not supported on Windows systems. + +- Triton supports only +[OTLP/HTTP Exporter](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/otlp.md#otlphttp) +and allows specification of only url for this exporter through +`--trace-config`. Other options and corresponding default values can be +found [here](https://github.com/open-telemetry/opentelemetry-cpp/tree/v1.8.3/exporters/otlp#configuration-options--otlp-http-exporter-). 
+ +- Triton does not support configuration of the opentelemetry trace settings +during a Triton run and opentelemetry specific settings are not available +for the retrieval through [Triton's trace extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_trace.md). diff --git a/docs/user_guide/v1_to_v2.md b/docs/user_guide/v1_to_v2.md new file mode 100644 index 0000000000..d9da6f6cf8 --- /dev/null +++ b/docs/user_guide/v1_to_v2.md @@ -0,0 +1,68 @@ + + +# Version 1 to Version 2 Migration + +Version 2 of Triton does not generally maintain backwards +compatibility with version 1. Specifically, you should take the +following items into account when transitioning from version 1 to +version 2. + +* The Triton executables and libraries are in /opt/tritonserver. The + Triton executable is /opt/tritonserver/bin/tritonserver. + +* Some *tritonserver* command-line arguments are removed, changed or + have different default behavior in version 2. + + * --api-version, --http-health-port, --grpc-infer-thread-count, + --grpc-stream-infer-thread-count,--allow-poll-model-repository, --allow-model-control + and --tf-add-vgpu are removed. + + * The default for --model-control-mode is changed to *none*. + + * --tf-allow-soft-placement and --tf-gpu-memory-fraction are renamed + to --backend-config="tensorflow,allow-soft-placement=\" + and --backend-config="tensorflow,gpu-memory-fraction=\". + +* The HTTP/REST and GRPC protocols, while conceptually similar to + version 1, are completely changed in version 2. See [inference + protocols](../customization_guide/inference_protocols.md) for more information. + +* Python and C++ client libraries are re-implemented to match the new + HTTP/REST and GRPC protocols. The Python client no longer depends on + a C++ shared library and so should be usable on any platform that + supports Python. See [client + libraries](https://github.com/triton-inference-server/client) for + more information. + +* Building Triton has changed significantly in version 2. See + [build](../customization_guide/build.md) for more information. + +* In the Docker containers the environment variables indicating the + Triton version have changed to have a TRITON prefix, for example, + TRITON_SERVER_VERSION. diff --git a/nvidia_entrypoint.sh b/nvidia_entrypoint.sh deleted file mode 100755 index fb3ea26de3..0000000000 --- a/nvidia_entrypoint.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -set -e -cat </dev/null) " == " " ]]; then - echo - echo "WARNING: The NVIDIA Driver was not detected. GPU functionality will not be available." - echo " Use 'nvidia-docker run' to start this container; see" - echo " https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker ." -else - ( /usr/local/bin/checkSMVER.sh ) - DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) - if [[ ! "$DRIVER_VERSION" =~ ^[0-9]*.[0-9]*$ ]]; then - echo "Failed to detect NVIDIA driver version." - elif [[ "${DRIVER_VERSION%.*}" -lt "${CUDA_DRIVER_VERSION%.*}" ]]; then - if [[ "${_CUDA_COMPAT_STATUS}" == "CUDA Driver OK" ]]; then - echo - echo "NOTE: Legacy NVIDIA Driver detected. Compatibility mode ENABLED." - else - echo - echo "ERROR: This container was built for NVIDIA Driver Release ${CUDA_DRIVER_VERSION%.*} or later, but" - echo " version ${DRIVER_VERSION} was detected and compatibility mode is UNAVAILABLE." - echo - echo " [[${_CUDA_COMPAT_STATUS}]]" - sleep 2 - fi - fi -fi - -if [[ "$(df -k /dev/shm |grep ^shm |awk '{print $2}') " == "65536 " ]]; then - echo - echo "NOTE: The SHMEM allocation limit is set to the default of 64MB. This may be" - echo " insufficient for the inference server. NVIDIA recommends the use of the following flags:" - echo " nvidia-docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 ..." -fi - -echo - -if [[ $# -eq 0 ]]; then - exec "/bin/bash" -else - exec "$@" -fi diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..2843ad2d42 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,51 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +# ignore allowed words +ignore-words-list = "passin" +# use the 'clear' dictionary for unambiguous spelling mistakes +builtin = "clear" +# disable warnings about binary files and wrong encoding +quiet-level = 3 + +[tool.isort] +profile = "black" +use_parentheses = true +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +ensure_newline_before_comments = true +line_length = 88 +balanced_wrapping = true +indent = " " +skip = ["build"] + diff --git a/qa/L0_async_work_queue/test.sh b/qa/L0_async_work_queue/test.sh new file mode 100755 index 0000000000..a6d09264f2 --- /dev/null +++ b/qa/L0_async_work_queue/test.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +TEST_LOG="./async_work_queue.log" +ASYNC_WORK_QUEUE_TEST=./async_work_queue_test + +RET=0 + +export CUDA_VISIBLE_DEVICES=0 + +rm -f TEST_LOG + +set +e +$ASYNC_WORK_QUEUE_TEST >>$TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_backend_bls/test.sh b/qa/L0_backend_bls/test.sh new file mode 100755 index 0000000000..0db3931626 --- /dev/null +++ b/qa/L0_backend_bls/test.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG:="main"} +TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG:="main"} +TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG:="main"} + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +# Backend build requires recent version of CMake (FetchContent required) +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . 
/etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* \ + rapidjson-dev +cmake --version + +rm -fr *.log ./backend + +git clone --single-branch --depth=1 -b $TRITON_BACKEND_REPO_TAG \ + ${TRITON_REPO_ORGANIZATION}/backend.git + +(cd backend/examples/backends/bls && + mkdir build && + cd build && + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + .. && + make -j4 install) + +rm -fr /opt/tritonserver/backends/bls +cp -r backend/examples/backends/bls/build/install/backends/bls /opt/tritonserver/backends/. + +SERVER_ARGS="--model-repository=`pwd`/backend/examples/model_repos/bls_models --log-verbose=1" +SERVER_LOG="./inference_server.log" +CLIENT_LOG="./client.log" + +mkdir `pwd`/backend/examples/model_repos/bls_models/bls_fp32/1/ + +# Run the server with all the required models. +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +backend/examples/clients/bls_client >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo "Failed: Client test had a non-zero return code." + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** bls_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_backend_config/test.sh b/qa/L0_backend_config/test.sh new file mode 100755 index 0000000000..dad586883b --- /dev/null +++ b/qa/L0_backend_config/test.sh @@ -0,0 +1,427 @@ +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Parses default-max-batch-size log record +# +# Example log record: +# I0521 02:12:37.402353 161 backend_model.cc:503] "Adding default backend config setting: default-max-batch-size,4 +parse_default_max_batch_size() { + echo $(python3 -c "print('$1'.split(',')[1].strip('\"'))") +} + +# Returns backend configuration json +# message from server log file path +# +# Example: config_map = $(get_config_map server.log) +get_config_map() { + BACKEND_CONFIG_MAP=$(grep "backend configuration:" $1) + echo $(python3 -c "backend_config='$BACKEND_CONFIG_MAP'.split('] \"backend configuration:\n')[1].rstrip('\"');print(backend_config)") +} + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +rm -rf ./models/ +mkdir -p ./models/no_config +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/savedmodel_float32_float32_float32/1 ./models/no_config/ + + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=20 +source ../common/util.sh + +SERVER_LOG_BASE="./inference_server" +rm -f $SERVER_LOG_BASE* +rm -f *.out + +COMMON_ARGS="--model-repository=`pwd`/models --strict-model-config=false --log-verbose=1 " + +NEGATIVE_PARSE_ARGS=("--backend-config=,default-max-batch-size=3 $COMMON_ARGS" \ + "--backend-config=default-max-batch-size= $COMMON_ARGS" \ + "--backend-config=default-max-batch-size $COMMON_ARGS" \ + "--backend-config=tensorflow,default-max-batch-size= $COMMON_ARGS" \ + "--backend-config=tensorflow,default-max-batch-size $COMMON_ARGS" \ +) + +POSITIVE_DEFAULT_ARGS=$COMMON_ARGS +POSITIVE_TEST_ARGS=("--backend-config=tensorflow,default-max-batch-size=5 $COMMON_ARGS" \ + "--backend-config=default-max-batch-size=6 $COMMON_ARGS" \ + "--backend-config=default-max-batch-size=7 --backend-config=tensorflow,default-max-batch-size=8 $COMMON_ARGS" \ +) + +# These integers correspond to the expected default-max-batch-size which gets set +# in the POSITIVE_TEST_ARGS +POSITIVE_TEST_ANSWERS=(5 6 8) + +RET=0 +# Positive tests +SERVER_ARGS=$POSITIVE_DEFAULT_ARGS +SERVER_LOG=$SERVER_LOG_BASE.backend_config_positive_default.log +run_server + +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + kill $SERVER_PID + wait $SERVER_PID + + RESULT_LOG_LINE=$(grep -a "Adding default backend config setting:" $SERVER_LOG) + if [ "$RESULT_LOG_LINE" != "" ]; then + + # Pick out the logged value of the default-max-batch-size which gets passed into model creation + RESOLVED_DEFAULT_MAX_BATCH_SIZE=$(parse_default_max_batch_size "${RESULT_LOG_LINE}") + + if [ "$RESOLVED_DEFAULT_MAX_BATCH_SIZE" != "4" ]; then + echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. 
Expected: default-max-batch-size,4, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" + RET=1 + fi + else + echo "*** FAILED: No log statement stating default max batch size\n" + RET=1 + fi +fi + +for ((i=0; i < ${#POSITIVE_TEST_ARGS[@]}; i++)); do + SERVER_ARGS=${POSITIVE_TEST_ARGS[$i]} + SERVER_LOG=$SERVER_LOG_BASE.backend_config_positive_$i.log + run_server + + if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + + else + kill $SERVER_PID + wait $SERVER_PID + + RESULT_LOG_LINE=$(grep -a "Found overwritten default setting:" $SERVER_LOG) + if [ "$RESULT_LOG_LINE" != "" ]; then + + # Pick out the logged value of the default-max-batch-size which gets passed into model creation + RESOLVED_DEFAULT_MAX_BATCH_SIZE=$(parse_default_max_batch_size "${RESULT_LOG_LINE}") + + if [ "$RESOLVED_DEFAULT_MAX_BATCH_SIZE" != "${POSITIVE_TEST_ANSWERS[$i]}" ]; then + echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: ${POSITIVE_TEST_ANSWERS[$i]}, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" + RET=1 + fi + else + echo "*** FAILED: No log statement stating default max batch size\n" + RET=1 + fi + fi +done + +# Negative tests +# Failing because the syntax is incorrect +for ((i=0; i < ${#NEGATIVE_PARSE_ARGS[@]}; i++)); do + SERVER_ARGS=${NEGATIVE_PARSE_ARGS[$i]} + SERVER_LOG=$SERVER_LOG_BASE.backend_config_negative_parse$i.log + run_server + + if [ "$SERVER_PID" == "0" ]; then + if ! grep -e "--backend-config option format is" $SERVER_LOG; then + echo -e "*** FAILED: Expected invalid backend config parse message but found other error.\n" + RET=1 + fi + else + echo -e "*** FAILED: Expected server to exit with error, but found running.\n" + RET=1 + kill $SERVER_PID + wait $SERVER_PID + fi +done + + +# +# Specific backend tests +# + +# While inference server is running, save the +# config of the 'no_config' model to the TRIAL +# file. 
+function save_model_config() { + CODE=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/no_config/config` + set -e + if [ "$CODE" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +} + +# Tensorflow 1: Batching ON +rm -rf ./models/ +mkdir -p ./models/no_config +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/savedmodel_float32_float32_float32/1 ./models/no_config/ + +SERVER_ARGS="--backend-config=tensorflow,default-max-batch-size=5 $COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.backend_config_tensorflow_batch_5.log +run_server + +TRIAL=tensorflow_batching_on +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 +else + save_model_config + + # Assert the max-batch-size is the command line value + MAX_BATCH_LOG_LINE=$(grep -a "\"max_batch_size\":5" $TRIAL.out) + if [ "$MAX_BATCH_LOG_LINE" == "" ]; then + cat $TRIAL.out + echo "*** FAILED: Expected max batch size to be 5 but found: $MAX_BATCH_LOG_LINE\n" + RET=1 + fi + + # Assert we are also turning on the dynamic_batcher + DYNAMIC_BATCHING_LOG_LINE=$(grep -a "Starting dynamic-batcher thread" $SERVER_LOG) + if [ "$DYNAMIC_BATCHING_LOG_LINE" == "" ]; then + echo "*** FAILED: Expected dynamic batching to be set in model config but was not found\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + +fi + +# Tensorflow 1: Batching OFF +SERVER_ARGS="--backend-config=tensorflow,default-max-batch-size=0 $COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.backend_config_tensorflow_batch_0.log +run_server + +TRIAL=tensorflow_batching_off +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + save_model_config + + # Assert the max-batch-size is 0 in the case batching is supported + # in the model but not in the config. 
+ MAX_BATCH_LOG_LINE=$(grep -a "\"max_batch_size\":0" $TRIAL.out) + if [ "$MAX_BATCH_LOG_LINE" == "" ]; then + echo "*** FAILED: Expected max batch size to be 0 but found: $MAX_BATCH_LOG_LINE\n" + RET=1 + fi + + # Assert batching disabled + if [ "$(grep -a -E '\"dynamic_batching\": \{}' $SERVER_LOG)" != "" ]; then + echo "*** FAILED: Found dynamic batching enabled in configuration when none expected.\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + +fi + +# Onnxruntime: Batching ON +rm -rf ./models/ +mkdir -p ./models/no_config +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/onnx_float32_float32_float32/1 ./models/no_config/ + +SERVER_ARGS="--backend-config=onnxruntime,default-max-batch-size=5 $COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.backend_config_onnxruntime_batch_5.log +run_server + +TRIAL=onnxruntime_batching_on +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + save_model_config + + # Assert the max-batch-size is the command line value + MAX_BATCH_LOG_LINE=$(grep -a "\"max_batch_size\":5" $TRIAL.out) + if [ "$MAX_BATCH_LOG_LINE" == "" ]; then + echo "*** FAILED: Expected max batch size to be 5 but found: $MAX_BATCH_LOG_LINE\n" + RET=1 + fi + + # Assert we are also turning on the dynamic_batcher + DYNAMIC_BATCHING_LOG_LINE=$(grep -a "Starting dynamic-batcher thread" $SERVER_LOG) + if [ "$DYNAMIC_BATCHING_LOG_LINE" == "" ]; then + echo "*** FAILED: Expected dynamic batching to be set in model config but was not found\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID +fi + +# Onnxruntime: Batching OFF +rm -rf ./models/ +mkdir -p ./models/no_config +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/onnx_float32_float32_float32/1 ./models/no_config/ + +SERVER_ARGS="--backend-config=onnxruntime,default-max-batch-size=0 $COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.backend_config_onnxruntime_batch_0.log +run_server + +TRIAL=onnxruntime_batching_off +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + save_model_config + + # Assert the max-batch-size is 0 in the case batching is supported + # in the model but not in the config. + MAX_BATCH_LOG_LINE=$(grep -a "\"max_batch_size\":0" $TRIAL.out) + if [ "$MAX_BATCH_LOG_LINE" == "" ]; then + echo "*** FAILED: Expected max batch size to be 0 but found: $MAX_BATCH_LOG_LINE\n" + RET=1 + fi + + # Assert batching disabled + if [ "$(grep -a -E '\"dynamic_batching\": \{}' $SERVER_LOG)" != "" ]; then + echo "*** FAILED: Found dynamic batching in configuration when none expected.\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + +fi + +# +# General backend tests +# + +# We want to make sure that backend configurations +# are not lost. 
For this purpose we are using only onnx backend + +rm -rf ./models/ +mkdir -p ./models/no_config/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/onnx_float32_float32_float32/1 ./models/no_config/ + +# First getting a baseline for the number of default configs +# added during a server set up +SERVER_ARGS="$COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.default_configs.log +run_server + +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + # Count number of default configs + BACKEND_CONFIG_MAP=$(get_config_map $SERVER_LOG) + DEFAULT_CONFIG_COUNT=$(echo $BACKEND_CONFIG_MAP | jq -r | jq '.["cmdline"]' | jq length) + if [ $DEFAULT_CONFIG_COUNT -lt 4 ]; then + echo "*** FAILED: Expected number of default configs to be at least 4 but found: $DEFAULT_CONFIG_COUNT\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + +fi + +# Now make sure that when setting specific backend configs +# default ones are not lost. +# Current logic for backend config resolution reads default configs first, +# then specific configs and overrides defaults if needed. +# We would like to make sure that none of configs are lost and +# defaults are properly overridden. +# One of defaultconfigs is `min-compute-capability`. This test +# checks if it is properlly overridden. +MIN_COMPUTE_CAPABILITY=XX +SERVER_ARGS="--backend-config=onnxruntime,min-compute-capability=$MIN_COMPUTE_CAPABILITY $COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.global_configs.log +run_server + +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + # Count number of default configs + BACKEND_CONFIG_MAP=$(get_config_map $SERVER_LOG) + CONFIG_VALUE=$(echo $BACKEND_CONFIG_MAP | jq -r | jq '.["cmdline"]' | jq -r '.["min-compute-capability"]') + + if [ $CONFIG_VALUE != $MIN_COMPUTE_CAPABILITY ]; then + echo "*** FAILED: Expected min-compute-capability config to be $MIN_COMPUTE_CAPABILITY but found: $CONFIG_VALUE\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + +fi +# Now make sure that specific backend configs are not lost. +SERVER_ARGS="--backend-config=onnxruntime,a=0 --backend-config=onnxruntime,y=0 --backend-config=onnxruntime,z=0 $COMMON_ARGS" +SERVER_LOG=$SERVER_LOG_BASE.specific_configs.log +EXPECTED_CONFIG_COUNT=$(($DEFAULT_CONFIG_COUNT+3)) +run_server + +if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: Server failed to start $SERVER\n" + RET=1 + +else + # Count number of default configs + BACKEND_CONFIG_MAP=$(get_config_map $SERVER_LOG) + TOTAL_CONFIG_COUNT=$(echo $BACKEND_CONFIG_MAP | jq -r | jq '.["cmdline"]' | jq 'length') + + if [ $TOTAL_CONFIG_COUNT -ne $EXPECTED_CONFIG_COUNT ]; then + echo "*** FAILED: Expected number of backend configs to be $EXPECTED_CONFIG_COUNT but found: $TOTAL_CONFIG_COUNT\n" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + +fi + + +# Print test outcome +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET + diff --git a/qa/L0_backend_fastertransformer/test.sh b/qa/L0_backend_fastertransformer/test.sh new file mode 100755 index 0000000000..7491bdd761 --- /dev/null +++ b/qa/L0_backend_fastertransformer/test.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +FASTERTRANSFORMER_BRANCH_TAG=${FASTERTRANSFORMER_BRANCH_TAG:="main"} +FASTERTRANSFORMER_BRANCH=${FASTERTRANSFORMER_BRANCH:="https://github.com/triton-inference-server/fastertransformer_backend.git"} +SERVER_TIMEOUT=600 +SERVER_LOG="$PWD/inference_server" +CLIENT_LOG="$PWD/client" + +MODEL_DIR=${MODEL_DIR:=$PWD/fastertransformer_backend/all_models/t5/} +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +SERVER_ARGS_EXTRA="--exit-timeout-secs=${SERVER_TIMEOUT} --backend-directory=${BACKEND_DIR}" +SERVER_ARGS="--model-repository=${MODEL_DIR} ${SERVER_ARGS_EXTRA}" +source ../common/util.sh + +rm -f $SERVER_LOG* $CLIENT_LOG* + +RET=0 +# install dependencies +apt-get update && \ + apt-get install -y --no-install-recommends python3 python3-pip python3-protobuf +python3 -m pip install --upgrade pip && \ + pip3 install --upgrade "numpy<2" + +# install client libraries +pip3 install tritonclient[all] + +# Clone repo +git clone --single-branch --depth=1 -b ${FASTERTRANSFORMER_BRANCH_TAG} ${FASTERTRANSFORMER_BRANCH} +cd fastertransformer_backend + +run_server + +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python3 tools/issue_request.py tools/requests/sample_request_single_t5.json >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +fi + +kill_server + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $SERVER_LOG + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_backend_identity/identity_test.py b/qa/L0_backend_identity/identity_test.py new file mode 100755 index 0000000000..a607e4189b --- /dev/null +++ b/qa/L0_backend_identity/identity_test.py @@ -0,0 +1,298 @@ +#!/usr/bin/python + +# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys +from builtins import range + +import numpy as np +import requests as httpreq +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", "--url", type=str, required=False, help="Inference server URL." + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + if FLAGS.url is None: + FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" + + # Run async requests to make sure backend handles request batches + # correctly. We use just HTTP for this since we are not testing the + # protocol anyway. + if FLAGS.protocol == "http": + model_name = "identity_uint32" + request_parallelism = 4 + shape = [2, 2] + with client_util.InferenceServerClient( + FLAGS.url, concurrency=request_parallelism, verbose=FLAGS.verbose + ) as client: + input_datas = [] + requests = [] + for i in range(request_parallelism): + input_data = (16384 * np.random.randn(*shape)).astype(np.uint32) + input_datas.append(input_data) + inputs = [ + client_util.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + requests.append(client.async_infer(model_name, inputs)) + + for i in range(request_parallelism): + # Get the result from the initiated asynchronous inference request. 
+ # Note the call will block till the server responds. + results = requests[i].get_result() + print(results) + + output_data = results.as_numpy("OUTPUT0") + if output_data is None: + print("error: expected 'OUTPUT0'") + sys.exit(1) + + if not np.array_equal(output_data, input_datas[i]): + print( + "error: expected output {} to match input {}".format( + output_data, input_datas[i] + ) + ) + sys.exit(1) + + # Make sure the requests ran in parallel. + stats = client.get_inference_statistics(model_name) + if (len(stats["model_stats"]) != 1) or ( + stats["model_stats"][0]["name"] != model_name + ): + print("error: expected statistics for {}".format(model_name)) + sys.exit(1) + + stat = stats["model_stats"][0] + if (stat["inference_count"] != 8) or (stat["execution_count"] != 1): + print( + "error: expected execution_count == 1 and inference_count == 8, got {} and {}".format( + stat["execution_count"], stat["inference_count"] + ) + ) + sys.exit(1) + + # Check metrics to make sure they are reported correctly + metrics = httpreq.get("http://localhost:8002/metrics") + print(metrics.text) + + success_str = ( + 'nv_inference_request_success{model="identity_uint32",version="1"}' + ) + infer_count_str = 'nv_inference_count{model="identity_uint32",version="1"}' + infer_exec_str = ( + 'nv_inference_exec_count{model="identity_uint32",version="1"}' + ) + custom_metric_str = ( + 'input_byte_size_counter{model="identity_uint32",version="1"}' + ) + + success_val = None + infer_count_val = None + infer_exec_val = None + custom_metric_val = None + for line in metrics.text.splitlines(): + if line.startswith(success_str): + success_val = float(line[len(success_str) :]) + if line.startswith(infer_count_str): + infer_count_val = float(line[len(infer_count_str) :]) + if line.startswith(infer_exec_str): + infer_exec_val = float(line[len(infer_exec_str) :]) + if line.startswith(custom_metric_str): + custom_metric_val = float(line[len(custom_metric_str) :]) + + if success_val != 4: + print( + "error: expected metric {} == 4, got {}".format( + success_str, success_val + ) + ) + sys.exit(1) + if infer_count_val != 8: + print( + "error: expected metric {} == 8, got {}".format( + infer_count_str, infer_count_val + ) + ) + sys.exit(1) + if infer_exec_val != 1: + print( + "error: expected metric {} == 1, got {}".format( + infer_exec_str, infer_exec_val + ) + ) + sys.exit(1) + if custom_metric_val != 64: + print( + "error: expected metric {} == 64, got {}".format( + custom_metric_str, custom_metric_val + ) + ) + sys.exit(1) + + # Reuse a single client for all sync tests + with client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) as client: + for model_name, np_dtype, shape in ( + # yapf: disable + ("identity_fp32", np.float32, [1, 0]), + ("identity_fp32", np.float32, [1, 5]), + ("identity_uint32", np.uint32, [4, 0]), + ("identity_uint32", np.uint32, [8, 5]), + ("identity_nobatch_int8", np.int8, [0]), + ("identity_nobatch_int8", np.int8, [7]), + ("identity_bytes", object, [1, 1]), + ("identity_bf16", np.float32, [1, 0]), + ("identity_bf16", np.float32, [1, 5]) + ): + # yapf: enable + if np_dtype != object: + input_data = (16384 * np.random.randn(*shape)).astype(np_dtype) + else: + in0 = 16384 * np.ones(shape, dtype="int") + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) + input_data = in0n.reshape(in0.shape) + if model_name != "identity_bf16": + triton_type = np_to_triton_dtype(input_data.dtype) + else: + triton_type = "BF16" + inputs = [client_util.InferInput("INPUT0", input_data.shape, 
triton_type)] + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs) + print(results) + + # Make sure outputs are expected value + output_data = results.as_numpy("OUTPUT0") + + if np_dtype == object: + output_data = np.array( + [str(x, encoding="utf-8") for x in output_data.flatten()], + dtype=object, + ).reshape(output_data.shape) + + if output_data is None: + print("error: expected 'OUTPUT0'") + sys.exit(1) + + if model_name == "identity_bf16": + if input_data.shape != output_data.shape: + print( + "error: expected output shape {} to match input shape {}".format( + output_data.shape, input_data.shape + ) + ) + sys.exit(1) + for input, output in zip( + np.nditer(input_data, flags=["refs_ok", "zerosize_ok"], order="C"), + np.nditer(output_data, flags=["refs_ok", "zerosize_ok"], order="C"), + ): + if input.tobytes()[2:4] != output.tobytes()[2:4]: + print( + "error: expected low-order bits of output {} to match low-order bits of input {}".format( + output, input + ) + ) + sys.exit(1) + if output.tobytes()[0:2] != b"\x00\x00": + print( + "error: expected output {} to have all-zero high-order bits, got {}".format( + output, output.tobytes()[0:2] + ) + ) + sys.exit(1) + else: + if not np.array_equal(output_data, input_data): + print( + "error: expected output {} to match input {}".format( + output_data, input_data + ) + ) + sys.exit(1) + + # Make sure response parameters are correct + response = results.get_response() + if FLAGS.protocol == "http": + params = response["parameters"] + param0 = params["param0"] + param1 = params["param1"] + param2 = params["param2"] + param3 = params["param3"] + else: + params = response.parameters + param0 = params["param0"].string_param + param1 = params["param1"].int64_param + param2 = params["param2"].bool_param + param3 = params["param3"].double_param + + if param0 != "an example string parameter": + print("error: expected 'param0' == 'an example string parameter'") + sys.exit(1) + if param1 != 42: + print("error: expected 'param1' == 42") + sys.exit(1) + if param2 != False: + print("error: expected 'param2' == False") + sys.exit(1) + if param3 != 123.123: + print("error: expected 'param3' == 123.123") + sys.exit(1) diff --git a/qa/L0_backend_identity/test.sh b/qa/L0_backend_identity/test.sh new file mode 100755 index 0000000000..bd29951ba6 --- /dev/null +++ b/qa/L0_backend_identity/test.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_PY=./identity_test.py +CLIENT_LOG="./client.log" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/all_models --log-verbose=1" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -fr *.log ./all_models + +cp -r ./models ./all_models +cp -r ./models/identity_fp32 ./all_models/identity_bytes +(cd all_models/identity_bytes && \ + sed -i "s/^name:.*/name: \"identity_bytes\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_STRING/g" config.pbtxt) +cp -r ./models/identity_fp32 ./all_models/identity_nobatch_int8 +(cd all_models/identity_nobatch_int8 && \ + sed -i "s/^name:.*/name: \"identity_nobatch_int8\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 0/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_INT8/g" config.pbtxt) +cp -r ./models/identity_fp32 ./all_models/identity_uint32 +(cd all_models/identity_uint32 && \ + sed -i "s/^name:.*/name: \"identity_uint32\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_UINT32/g" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 3000000 }" >> config.pbtxt) +cp -r ./models/identity_fp32 ./all_models/identity_bf16 +(cd all_models/identity_bf16 && \ + sed -i "s/^name:.*/name: \"identity_bf16\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_BF16/g" config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +for PROTOCOL in http grpc; do + set +e + python $CLIENT_PY -i $PROTOCOL -v >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo "Failed: Client test had a non-zero return code." + RET=1 + fi + set -e +done + +kill $SERVER_PID +wait $SERVER_PID + +# Validate the byte_sizes reported by backend +OLDIFS=$IFS; IFS=',' +for i in "byte_size = 0, 8", \ + "byte_size = 7, 2", \ + "byte_size = 16, 6", \ + "byte_size = 20, 2", \ + "byte_size = 160, 2" \ + ; do set -- $i; \ + # $SERVER_LOG is recorded as a binary file. Using -a option + # to correctly grep the pattern in the server log. + if [[ $(cat $SERVER_LOG | grep -a $1 | wc -l) -ne $2 ]]; then + echo -e "\n***\n*** Test Failed $1 $2\n***" + RET=1 + fi +done +IFS=$OLDIFS + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $SERVER_LOG + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_backend_output_detail/test.sh b/qa/L0_backend_output_detail/test.sh new file mode 100755 index 0000000000..a8f4de59d1 --- /dev/null +++ b/qa/L0_backend_output_detail/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "No Repo version detected" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi +export CUDA_VISIBLE_DEVICES=0 + +rm -f *.log +MODELSDIR=`pwd`/models +rm -fr $MODELSDIR && mkdir -p $MODELSDIR/add_sub/1 && \ + cp ../python_models/add_sub/config.pbtxt $MODELSDIR/add_sub && \ + cp ../python_models/add_sub/model.py $MODELSDIR/add_sub/1 && \ + +source ../common/util.sh + +RET=0 + +TEST_LOG="./backend_output_detail_test.log" +TEST_EXEC=./backend_output_detail_test + +set +e +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Backend Output Detail Unit Test Failed\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py b/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py new file mode 100644 index 0000000000..e03b1878bc --- /dev/null +++ b/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py @@ -0,0 +1,243 @@ +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import unittest + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class ArgumentValidationTest(unittest.TestCase): + def test_infer_request_args(self): + # Dummy arguments used in the tests. + inputs = [pb_utils.Tensor("INPUT0", np.asarray([1, 2], dtype=np.int32))] + model_name = "my_model" + requested_output_names = ["my_output"] + + # + # inputs field validation + # + + # Test list of None as inputs + with self.assertRaises(pb_utils.TritonModelException) as e: + pb_utils.InferenceRequest( + inputs=[None], + model_name=model_name, + requested_output_names=requested_output_names, + ) + + # Test None object as list of inputs + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + inputs=None, + model_name=model_name, + requested_output_names=requested_output_names, + ) + + # model_name validation + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + model_name=None, + inputs=inputs, + requested_output_names=requested_output_names, + ) + + # + # Requested output name validations + # + + # Test list of None objects as requested_output_names + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + requested_output_names=[None], inputs=inputs, model_name=model_name + ) + + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + requested_output_names=None, inputs=inputs, model_name=model_name + ) + + # Other arguments validation + + # correlation_id set to None + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + correlation_id=None, + ) + + # correlation_id set to an integer + infer_request_test = pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + correlation_id=5, + ) + self.assertIsInstance(infer_request_test.correlation_id(), int) + self.assertEqual(infer_request_test.correlation_id(), 5) + + # correlation_id set to string + infer_request_test = pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + correlation_id="test_str_id-5", + ) + self.assertIsInstance(infer_request_test.correlation_id(), str) + self.assertEqual(infer_request_test.correlation_id(), "test_str_id-5") + + # correlation_id default + infer_request_test = pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + ) + self.assertIsInstance(infer_request_test.correlation_id(), 
int) + self.assertEqual(infer_request_test.correlation_id(), 0) + + # request_id set to None + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + request_id=None, + ) + + # model_version set to None + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + model_version=None, + ) + + # flags set to None + with self.assertRaises(TypeError) as e: + pb_utils.InferenceRequest( + requested_output_names=requested_output_names, + inputs=inputs, + model_name=model_name, + flags=None, + ) + + # Empty lists should not raise an exception + pb_utils.InferenceRequest( + requested_output_names=[], inputs=[], model_name=model_name + ) + + def test_infer_response_args(self): + outputs = [pb_utils.Tensor("OUTPUT0", np.asarray([1, 2], dtype=np.int32))] + + # Test list of None object as output tensor + with self.assertRaises(pb_utils.TritonModelException) as e: + pb_utils.InferenceResponse(output_tensors=[None]) + + # Test None as output tensors + with self.assertRaises(TypeError) as e: + pb_utils.InferenceResponse(output_tensors=None) + + # This should not raise an exception + pb_utils.InferenceResponse(output_tensors=[]) + pb_utils.InferenceResponse(outputs) + + def test_tensor_args(self): + np_array = np.asarray([1, 2], dtype=np.int32) + + # Test None as tensor name + with self.assertRaises(TypeError) as e: + pb_utils.Tensor(None, np_array) + + # Test None as Numpy array + with self.assertRaises(TypeError) as e: + pb_utils.Tensor("OUTPUT0", None) + + # Test None as dlpack capsule + with self.assertRaises(pb_utils.TritonModelException) as e: + pb_utils.Tensor.from_dlpack("OUTPUT0", None) + + # Test empty string as tensor name (from_dlpack) + with self.assertRaises(pb_utils.TritonModelException) as e: + pb_utils.Tensor.from_dlpack("", None) + + # Test empty string as tensor name + with self.assertRaises(TypeError) as e: + pb_utils.Tensor("", None) + + def test_log_args(self): + logger = pb_utils.Logger + + # Test None as log level setting + with self.assertRaises(TypeError) as e: + logger.log("Invalid Level", None) + + # Test integer as log level setting + with self.assertRaises(TypeError) as e: + logger.log("Invalid Level", 1) + + # Test None as log info msg + with self.assertRaises(TypeError) as e: + logger.log_info(None) + + # Test None as log warning msg + with self.assertRaises(TypeError) as e: + logger.log_warn(None) + + # Test None as log error msg + with self.assertRaises(TypeError) as e: + logger.log_error(None) + + # Test None as log verbose msg + with self.assertRaises(TypeError) as e: + logger.log_verbose(None) + + # This should not raise an exception + logger.log("Level unspecified") + + +class TritonPythonModel: + """This model tests the Python API arguments to make sure invalid args are + rejected.""" + + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. 
+ test = unittest.main("model", exit=False) + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/L0_backend_python/argument_validation/models/argument_validation/config.pbtxt b/qa/L0_backend_python/argument_validation/models/argument_validation/config.pbtxt new file mode 100644 index 0000000000..4c02983319 --- /dev/null +++ b/qa/L0_backend_python/argument_validation/models/argument_validation/config.pbtxt @@ -0,0 +1,39 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "argument_validation" +backend: "python" +max_batch_size: 0 + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh new file mode 100755 index 0000000000..90cbef89b5 --- /dev/null +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_PY=../test_infer_shm_leak.py +CLIENT_LOG="./arg_validation_client.log" +TEST_RESULT_FILE='test_results.txt' +SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./arg_validation_server.log" + +RET=0 +source ../../common/util.sh + +rm -fr *.log + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +export MODEL_NAME="argument_validation" +python3 -m pytest --junitxml="${MODEL_NAME}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** python_unittest.py FAILED. \n***" + RET=1 +fi +set -e + +kill_server + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Argument validation test FAILED. \n***" +else + echo -e "\n***\n*** Argument validation test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/async_execute/concurrency_test.py b/qa/L0_backend_python/async_execute/concurrency_test.py new file mode 100644 index 0000000000..bc5f31650b --- /dev/null +++ b/qa/L0_backend_python/async_execute/concurrency_test.py @@ -0,0 +1,161 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
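+
+# concurrency_test.py drives the async-execute concurrency checks: it times
+# batched and back-to-back requests against the async_execute_decouple model
+# (directly and through the async_execute_decouple_bls wrapper), checks that a
+# shorter-running request streams its response back before a longer one, and
+# verifies that a negative WAIT_SECONDS value surfaces as a ValueError in the
+# server log, whose path the test reads from the SERVER_LOG environment
+# variable.
+#
+# A rough, assumed sketch of the model under test (the real implementation is
+# qa/python_models/async_execute_decouple/model.py): an async execute() that
+# does roughly
+#
+#     await asyncio.sleep(wait_secs)  # wait_secs taken from the WAIT_SECONDS input
+#     # ...then sends a DUMMY_OUT response through the decoupled response
+#     # sender, raising ValueError when wait_secs is negative
+#
+# so that several in-flight requests can overlap on a single model instance.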
+ +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class ConcurrencyTest(unittest.TestCase): + def setUp(self): + # Initialize client + self._triton = grpcclient.InferenceServerClient("localhost:8001") + + def _generate_streaming_callback_and_response_pair(self): + response = [] # [{"result": result, "error": error}, ...] + + def callback(result, error): + response.append({"result": result, "error": error}) + + return callback, response + + # Helper for testing concurrent execution + def _concurrent_execute_requests(self, model_name, batch_size, number_of_requests): + delay_secs = 4 + shape = [batch_size, 1] + inputs = [grpcclient.InferInput("WAIT_SECONDS", shape, "FP32")] + inputs[0].set_data_from_numpy(np.full(shape, delay_secs, dtype=np.float32)) + + callback, response = self._generate_streaming_callback_and_response_pair() + self._triton.start_stream(callback) + for i in range(number_of_requests): + self._triton.async_stream_infer(model_name, inputs) + + # 2s for sending requests for processing and 2s for returning results. + wait_secs = 2 + delay_secs + 2 + time.sleep(wait_secs) + # Ensure the sleep is shorter than sequential processing delay. + sequential_min_delay = wait_secs * batch_size * number_of_requests + self.assertLessEqual(wait_secs, sequential_min_delay) + + # If executed sequentially, the results are not available yet, so concurrent + # execution is observed from seeing the correct responses. + self.assertEqual(len(response), number_of_requests) + for res in response: + self.assertEqual(res["result"].as_numpy("DUMMY_OUT").shape[0], batch_size) + self.assertIsNone(res["error"]) + + self._triton.stop_stream() + + # Test batched requests are executed concurrently + def test_concurrent_execute_single_request(self): + self._concurrent_execute_requests( + model_name="async_execute_decouple", batch_size=4, number_of_requests=1 + ) + + # Test multiple requests are executed concurrently + def test_concurrent_execute_multi_request(self): + self._concurrent_execute_requests( + model_name="async_execute_decouple", batch_size=1, number_of_requests=4 + ) + + # Test batched requests are executed concurrently via bls + def test_concurrent_execute_single_request_bls(self): + self._concurrent_execute_requests( + model_name="async_execute_decouple_bls", batch_size=4, number_of_requests=1 + ) + + # Test multiple requests are executed concurrently via bls + def test_concurrent_execute_multi_request_bls(self): + self._concurrent_execute_requests( + model_name="async_execute_decouple_bls", batch_size=1, number_of_requests=4 + ) + + # Test requests with a shorter duration should return first + def test_concurrent_execute_different_duration(self): + model_name = "async_execute_decouple" + callback, response = self._generate_streaming_callback_and_response_pair() + self._triton.start_stream(callback) + + # Send 2 requests / delays + shape = [1, 1] + for delay_secs in [10, 2]: + inputs = [grpcclient.InferInput("WAIT_SECONDS", shape, "FP32")] + inputs[0].set_data_from_numpy(np.full(shape, delay_secs, dtype=np.float32)) + self._triton.async_stream_infer(model_name, inputs) + time.sleep(2) # leave a gap after each inference + shape[0] += 1 # batch size to track request id + + # The last request executes for 2 secs, leave an additional 2 secs for sending + # the request and 2 secs for receiving its response. Since 2 secs has elapsed + # after sending the request, wait for another 4 secs. 
+ time.sleep(4) + # The response of the last request should be available by now, while the first + # request executes for 10 secs and only 8 secs has elapsed, so its response + # should not be available by now. + self.assertEqual(len(response), 1) + self.assertEqual(response[0]["result"].as_numpy("DUMMY_OUT").shape[0], 2) + self.assertIsNone(response[0]["error"]) + + # The first request executes for 10 secs, leave an additional 2 secs for sending + # the request and 2 secs for receiving its response. Since 8 secs has elapsed + # after sending the request, wait for another 6 secs. + time.sleep(6) + # The response of the first request should be available by now. + self.assertEqual(len(response), 2) + self.assertEqual(response[1]["result"].as_numpy("DUMMY_OUT").shape[0], 1) + self.assertIsNone(response[1]["error"]) + + self._triton.stop_stream() + + # Test model exception handling + def test_model_raise_exception(self): + model_name = "async_execute_decouple" + delay_secs = -1 # model will raise exception + shape = [1, 1] + inputs = [grpcclient.InferInput("WAIT_SECONDS", shape, "FP32")] + inputs[0].set_data_from_numpy(np.full(shape, delay_secs, dtype=np.float32)) + + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("ValueError: wait_secs cannot be negative", server_log) + + callback, response = self._generate_streaming_callback_and_response_pair() + self._triton.start_stream(callback) + self._triton.async_stream_infer(model_name, inputs) + time.sleep(2) + self._triton.stop_stream() + + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertIn("ValueError: wait_secs cannot be negative", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/async_execute/test.sh b/qa/L0_backend_python/async_execute/test.sh new file mode 100755 index 0000000000..b52c2bffa5 --- /dev/null +++ b/qa/L0_backend_python/async_execute/test.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
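+
+# This script stages model directories for async_execute_decouple and
+# async_execute_decouple_bls, starts tritonserver against them, and runs
+# concurrency_test.py under pytest. SERVER_LOG is passed through the
+# environment so test_model_raise_exception can grep the server log for the
+# expected ValueError. run_server and the SERVER_PID convention (SERVER_PID
+# stays "0" when the server fails to start) come from ../../common/util.sh.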
+ +source ../../common/util.sh + +RET=0 + +# +# Test execution overlapping on the same instance +# +rm -rf models && mkdir models +mkdir -p models/async_execute_decouple/1 && \ + cp ../../python_models/async_execute_decouple/model.py models/async_execute_decouple/1 && \ + cp ../../python_models/async_execute_decouple/config.pbtxt models/async_execute_decouple +mkdir -p models/async_execute_decouple_bls/1 && \ + cp ../../python_models/async_execute_decouple_bls/model.py models/async_execute_decouple_bls/1 && \ + cp ../../python_models/async_execute_decouple_bls/config.pbtxt models/async_execute_decouple_bls + +TEST_LOG="concurrency_test.log" +SERVER_LOG="concurrency_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/async_execute/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml concurrency_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** async execute concurrency test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 1 ]; then + echo -e "\n***\n*** Async execute test FAILED\n***" +else + echo -e "\n***\n*** Async execute test Passed\n***" +fi +exit $RET diff --git a/qa/L0_backend_python/bls/bls_parameters_test.py b/qa/L0_backend_python/bls/bls_parameters_test.py new file mode 100755 index 0000000000..e8fe7dfa81 --- /dev/null +++ b/qa/L0_backend_python/bls/bls_parameters_test.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
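+
+# bls_parameters_test.py sends a one-element NUMBER_PARAMETERS (UINT8) input
+# to the bls_parameters model, requests the PARAMETERS_AGGREGATED output, and
+# decodes it as JSON. It expects bool_<i>, int_<i> and str_<i> entries for
+# i = 1..num_params, which the model builds up recursively (which is why it
+# needs at least num_params + 1 instances, per the comment below).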
+ +import json +import os +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import np_to_triton_dtype + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class TestBlsParameters(unittest.TestCase): + def test_bls_parameters(self): + model_name = "bls_parameters" + shape = [1] + num_params = 3 + + # Based on the num_params specified, the model will generate a JSON response + # containing all the supported parameter types for num_params times recursively. + # Make sure the model has at least num_params + 1 instances. + expected_params = {} + for i in range(1, num_params + 1): + expected_params["bool_" + str(i)] = bool(i) + expected_params["int_" + str(i)] = i + expected_params["str_" + str(i)] = str(i) + + with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: + input_data = np.array([num_params], dtype=np.ubyte) + inputs = [ + grpcclient.InferInput( + "NUMBER_PARAMETERS", shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + outputs = [grpcclient.InferRequestedOutput("PARAMETERS_AGGREGATED")] + result = client.infer(model_name, inputs, outputs=outputs) + params_json = str( + result.as_numpy("PARAMETERS_AGGREGATED")[0], encoding="utf-8" + ) + + params = json.loads(params_json) + self.assertEqual(params, expected_params) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh new file mode 100755 index 0000000000..46d1f40818 --- /dev/null +++ b/qa/L0_backend_python/bls/test.sh @@ -0,0 +1,356 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
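+
+# This script covers the BLS (Business Logic Scripting) paths of the Python
+# backend. It runs the bls, bls_async, bls_memory and bls_memory_async models
+# under both non-decoupled and decoupled modes and with two CUDA memory pool
+# sizes, then checks that BLS calls made from "initialize" or "finalize" are
+# rejected, exercises the explicit model loading API (with and without
+# warmup), and finishes with the BLS parameters test. The GPU-dependent
+# portion is skipped on Windows (TEST_WINDOWS).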
+ +CLIENT_PY=../test_infer_shm_leak.py +CLIENT_LOG="./bls_client.log" +TEST_RESULT_FILE='test_results.txt' +source ../../common/util.sh + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} + +RET=0 +rm -fr *.log ./models *.txt *.xml + +# FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU +if [[ ${TEST_WINDOWS} == 0 ]]; then + pip3 uninstall -y torch + pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html + + mkdir -p models/bls/1/ + cp ../../python_models/bls/model.py models/bls/1/ + cp ../../python_models/bls/config.pbtxt models/bls + + mkdir -p models/dlpack_add_sub/1/ + cp ../../python_models/dlpack_add_sub/model.py models/dlpack_add_sub/1/ + cp ../../python_models/dlpack_add_sub/config.pbtxt models/dlpack_add_sub + + mkdir -p models/bls_async/1/ + cp ../../python_models/bls_async/model.py models/bls_async/1/ + cp ../../python_models/bls_async/config.pbtxt models/bls_async + + mkdir -p models/bls_memory/1/ + cp ../../python_models/bls_memory/model.py models/bls_memory/1/ + cp ../../python_models/bls_memory/config.pbtxt models/bls_memory + + mkdir -p models/bls_memory_async/1/ + cp ../../python_models/bls_memory_async/model.py models/bls_memory_async/1/ + cp ../../python_models/bls_memory_async/config.pbtxt models/bls_memory_async + + mkdir -p models/add_sub/1/ + cp ../../python_models/add_sub/model.py models/add_sub/1/ + cp ../../python_models/add_sub/config.pbtxt models/add_sub + + mkdir -p models/execute_error/1/ + cp ../../python_models/execute_error/model.py models/execute_error/1/ + cp ../../python_models/execute_error/config.pbtxt models/execute_error + + mkdir -p models/identity_fp32/1/ + cp ../../python_models/identity_fp32/model.py models/identity_fp32/1/ + cp ../../python_models/identity_fp32/config.pbtxt models/identity_fp32 + + mkdir -p models/dlpack_identity/1/ + cp ../../python_models/dlpack_identity/model.py models/dlpack_identity/1/ + cp ../../python_models/dlpack_identity/config.pbtxt models/dlpack_identity + + cp -r ${DATADIR}/qa_sequence_implicit_model_repository/onnx_nobatch_sequence_int32/ ./models + + git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG + mkdir -p models/square_int32/1/ + cp python_backend/examples/decoupled/square_model.py models/square_int32/1/model.py + cp python_backend/examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt + + mkdir -p models/dlpack_square/1/ + cp ../../python_models/dlpack_square/model.py models/dlpack_square/1/ + cp ../../python_models/dlpack_square/config.pbtxt models/dlpack_square + + mkdir -p models/identity_fp32_timeout/1/ + cp ../../python_models/identity_fp32_timeout/model.py models/identity_fp32_timeout/1/ + cp ../../python_models/identity_fp32_timeout/config.pbtxt models/identity_fp32_timeout + + cp -r ${DATADIR}/qa_model_repository/libtorch_nobatch_float32_float32_float32/ ./models/libtorch_gpu && \ + sed -i 's/libtorch_nobatch_float32_float32_float32/libtorch_gpu/' models/libtorch_gpu/config.pbtxt && \ + echo "instance_group [ { kind: KIND_GPU} ]" >> models/libtorch_gpu/config.pbtxt + + cp -r ${DATADIR}/qa_model_repository/libtorch_nobatch_float32_float32_float32/ ./models/libtorch_cpu && \ + sed -i 's/libtorch_nobatch_float32_float32_float32/libtorch_cpu/' models/libtorch_cpu/config.pbtxt && \ + echo "instance_group [ { kind: KIND_CPU} ]" >> models/libtorch_cpu/config.pbtxt + + # Test with different sizes of CUDA memory pool + # TODO: Why 256 worked in place of 128, on 
decoupled data pipeline? + for CUDA_MEMORY_POOL_SIZE_MB in 64 256 ; do + CUDA_MEMORY_POOL_SIZE_BYTES=$((CUDA_MEMORY_POOL_SIZE_MB * 1024 * 1024)) + SERVER_ARGS="--model-repository=${MODELDIR}/bls/models --backend-directory=${BACKEND_DIR} --log-verbose=1 --cuda-memory-pool-byte-size=0:${CUDA_MEMORY_POOL_SIZE_BYTES}" + for TRIAL in non_decoupled decoupled ; do + export BLS_KIND=${TRIAL} + SERVER_LOG="./bls_${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.inference_server.log" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do + export MODEL_NAME=${MODEL_NAME} + # Run with pytest to capture the return code correctly + pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then + echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***" + RET=$EXIT_CODE + cat $SERVER_LOG + cat $CLIENT_LOG + fi + done + + kill_server + + set -e + + # Only check the timeout value if there is no error since the test + # may fail before the test_timeout case gets run. + if [ $RET -eq 0 ]; then + # Check for bls 'test_timeout' to ensure timeout value is being correctly passed + if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***" + cat $SERVER_LOG + RET=1 + fi + fi + + if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then + if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then + echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***" + cat $SERVER_LOG + RET=1 + fi + fi + done + done +fi + +SERVER_ARGS="--model-repository=${MODELDIR}/bls/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +# Test error handling when BLS is used in "initialize" or "finalize" function +ERROR_MESSAGE="BLS is only supported during the 'execute' function." + +rm -fr ./models +mkdir -p models/bls_init_error/1/ +cp ../../python_models/bls_init_error/model.py models/bls_init_error/1/ +cp ../../python_models/bls_init_error/config.pbtxt models/bls_init_error +SERVER_LOG="./bls_init_error_server.log" +# This variable is used to print out the correct server log for each sub-test. +SUB_TEST_RET=0 + +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + SUB_TEST_RET=1 + kill_server +else + if grep "$ERROR_MESSAGE" $SERVER_LOG; then + echo -e "Found \"$ERROR_MESSAGE\"" >> $CLIENT_LOG + else + echo -e "Not found \"$ERROR_MESSAGE\"" >> $CLIENT_LOG + RET=1 + SUB_TEST_RET=1 + fi +fi + +if [ $SUB_TEST_RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG +fi + +# FIXME: [DLIS-6122] Requires support for model load/unload +# Until we can simulate Ctrl^C bls_finialize_error will not pass. 
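+# Everything inside the TEST_WINDOWS guard below first checks that a BLS call
+# made from "finalize" is rejected during shutdown, then exercises the
+# explicit model loading API (--model-control-mode=explicit) against the
+# bls_model_loading model, first without and then with a model_warmup block
+# appended to its config.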
+if [[ ${TEST_WINDOWS} == 0 ]]; then + rm -fr ./models + mkdir -p models/bls_finalize_error/1/ + cp ../../python_models/bls_finalize_error/model.py models/bls_finalize_error/1/ + cp ../../python_models/bls_finalize_error/config.pbtxt models/bls_finalize_error/ + SERVER_LOG="./bls_finalize_error_server.log" + SUB_TEST_RET=0 + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + else + kill_server + + if grep "$ERROR_MESSAGE" $SERVER_LOG; then + echo -e "Found \"$ERROR_MESSAGE\"" >> $CLIENT_LOG + else + echo -e "Not found \"$ERROR_MESSAGE\"" >> $CLIENT_LOG + RET=1 + SUB_TEST_RET=1 + fi + + if [ $SUB_TEST_RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + fi + fi + + # Test model loading API with BLS + SUB_TEST_RET=0 + rm -fr ./models + mkdir -p models/bls_model_loading/1/ + cp ../../python_models/bls_model_loading/model.py models/bls_model_loading/1/ + cp ../../python_models/bls_model_loading/config.pbtxt models/bls_model_loading/ + cp -fr ${DATADIR}/qa_model_repository/onnx_int32_int32_int32 models/. + # Make only version 2, 3 is valid version directory + rm -rf models/onnx_int32_int32_int32/1 + + SERVER_LOG="./bls_model_loading_server.log" + SERVER_ARGS="--model-repository=${MODELDIR}/bls/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --log-verbose=1" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + else + export MODEL_NAME='bls_model_loading' + + set +e + code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load` + set -e + if [ "$code" == "400" ]; then + echo -e "\n***\n*** Failed to load model '${MODEL_NAME}'\n***" + RET=1 + SUB_TEST_RET=1 + fi + + set +e + + python3 -m pytest --junitxml="${MODEL_NAME}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** 'bls_model_loading' test FAILED. 
\n***" + cat $CLIENT_LOG + RET=1 + SUB_TEST_RET=1 + fi + + set -e + + kill_server + + if [ $SUB_TEST_RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + fi + fi + + # Test model loading API with BLS warmup + (cd models/bls_model_loading && \ + echo "model_warmup [{" >> config.pbtxt && \ + echo " name : \"regular sample\"" >> config.pbtxt && \ + echo " batch_size: 1" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"INPUT0\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 4" >> config.pbtxt && \ + echo " zero_data: false" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"INPUT1\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 4" >> config.pbtxt && \ + echo " zero_data: false" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}]" >> config.pbtxt ) + + SUB_TEST_RET=0 + SERVER_LOG="./bls_model_loading_server_warmup.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + else + set +e + code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load` + set -e + if [ "$code" == "400" ]; then + echo -e "\n***\n*** Failed to load model '${MODEL_NAME}'\n***" + RET=1 + SUB_TEST_RET=1 + fi + + kill_server + + if [ $SUB_TEST_RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + fi + fi +fi + +# Test BLS parameters +rm -rf params_models && mkdir -p params_models/bls_parameters/1 +cp ../../python_models/bls_parameters/model.py ./params_models/bls_parameters/1 +cp ../../python_models/bls_parameters/config.pbtxt ./params_models/bls_parameters + +TEST_LOG="./bls_parameters.log" +SERVER_LOG="./bls_parameters.server.log" + +SERVER_ARGS="--model-repository=${MODELDIR}/bls/params_models --backend-directory=${BACKEND_DIR} --log-verbose=1" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 bls_parameters_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** bls_parameters_test.py FAILED. \n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill_server + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** BLS test PASSED. \n***" +else + echo -e "\n***\n*** BLS test FAILED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/common.sh b/qa/L0_backend_python/common.sh new file mode 100755 index 0000000000..ca1ae4a7bc --- /dev/null +++ b/qa/L0_backend_python/common.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +get_shm_pages() { + shm_pages=(`ls /dev/shm`) + echo ${#shm_pages[@]} +} + +install_conda() { + rm -rf ./miniconda + file_name="Miniconda3-py310_23.11.0-2-Linux-x86_64.sh" + wget https://repo.anaconda.com/miniconda/$file_name + + # install miniconda in silent mode + bash $file_name -p ./miniconda -b + + # activate conda + eval "$(./miniconda/bin/conda shell.bash hook)" +} + +install_build_deps() { + apt update && apt install software-properties-common rapidjson-dev -y + # Using CMAKE installation instruction from:: https://apt.kitware.com/ + apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . /etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +} + +create_conda_env() { + local python_version=$1 + local env_name=$2 + conda create -n $env_name python=$python_version -y + conda activate $env_name + conda install -c conda-forge conda-pack -y +} + +create_conda_env_with_specified_path() { + local python_version=$1 + local env_path=$2 + conda create -p $env_path python=$python_version -y + conda activate $env_path + conda install -c conda-forge conda-pack -y +} + +create_python_backend_stub() { + rm -rf python_backend + git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG + (cd python_backend/ && mkdir builddir && cd builddir && \ + cmake -DTRITON_ENABLE_GPU=ON -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} -DTRITON_BACKEND_REPO_TAG=$TRITON_BACKEND_REPO_TAG -DTRITON_COMMON_REPO_TAG=$TRITON_COMMON_REPO_TAG -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG ../ && \ + make -j18 triton-python-backend-stub) +} diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh new file mode 100755 index 0000000000..9020c7ebfd --- /dev/null +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_PY=../test_infer_shm_leak.py +CLIENT_LOG="./custom_metrics_client.log" +TEST_RESULT_FILE='test_results.txt' +source ../../common/util.sh + +SERVER_ARGS="--model-repository=${MODELDIR}/custom_metrics/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./custom_metrics_server.log" + +RET=0 +rm -fr *.log ./models *.txt + +mkdir -p models/custom_metrics/1/ +cp ../../python_models/custom_metrics/model.py models/custom_metrics/1/ +cp ../../python_models/custom_metrics/config.pbtxt models/custom_metrics + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +export MODEL_NAME='custom_metrics' +python3 -m pytest --junitxml="${MODEL_NAME}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** 'Custom Metrics' test FAILED. \n***" + cat $CLIENT_LOG + RET=1 +fi + +set -e + +kill_server + + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Custom Metrics test FAILED. \n***" +else + echo -e "\n***\n*** Custom Metrics test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/decoupled/decoupled_test.py b/qa/L0_backend_python/decoupled/decoupled_test.py new file mode 100755 index 0000000000..45ce370fb1 --- /dev/null +++ b/qa/L0_backend_python/decoupled/decoupled_test.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +sys.path.append("../../common") + +import queue +import time +import unittest +from functools import partial + +import numpy as np +import shm_util +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class DecoupledTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def test_decoupled_execute_error(self): + # The decoupled_execute_error model returns an error for the first + # request and successfully processes the second request. This is making + # sure that an error in a single request does not completely fail the + # batch. + + model_name = "decoupled_execute_error" + shape = [2, 2] + number_of_requests = 2 + user_data = UserData() + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as triton_client: + triton_client.start_stream(callback=partial(callback, user_data)) + + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer( + model_name=model_name, inputs=inputs + ) + + for i in range(number_of_requests): + result = user_data._completed_requests.get() + if i == 0: + self.assertIs(type(result), InferenceServerException) + continue + + print(result) + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + self.assertTrue( + np.array_equal(output_data, input_datas[i]), + "error: expected output {} to match input {}".format( + output_data, input_datas[i] + ), + ) + + def test_decoupled_bls(self): + # Test combinations of BLS and decoupled API in Python backend. 
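+        # The decoupled_bls model (see models/decoupled_bls/1/model.py) makes
+        # a synchronous BLS call to identity_fp32 inside execute() and then
+        # returns the result from a separate response-sender thread, so a
+        # single streamed request should yield exactly one matching response.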
+ model_name = "decoupled_bls" + shape = [1, 2] + user_data = UserData() + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as triton_client: + triton_client.start_stream(callback=partial(callback, user_data)) + + input_datas = [] + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + + # Check the results of the decoupled model using BLS + def check_result(result): + # Make sure the result is not an exception + self.assertIsNot(type(result), InferenceServerException) + + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + self.assertTrue( + np.array_equal(output_data, input_data), + "error: expected output {} to match input {}".format( + output_data, input_data + ), + ) + + result = user_data._completed_requests.get() + check_result(result) + + def test_decoupled_bls_stream(self): + # Test combinations of BLS and decoupled API in Python backend. + model_name = "decoupled_bls_stream" + in_values = [4, 2, 0, 1] + user_data = UserData() + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as triton_client: + triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(len(in_values)): + input_data = np.array([in_values[i]], dtype=np.int32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer( + model_name=model_name, inputs=inputs, request_id=str(i) + ) + + # Retrieve results... + recv_count = 0 + expected_count = sum(in_values) + result_dict = {} + while recv_count < expected_count: + data_item = user_data._completed_requests.get() + self.assertIsNot(type(data_item), InferenceServerException) + + this_id = data_item.get_response().id + if this_id not in result_dict.keys(): + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + # Validate results... 
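+                # Each request id i is expected to receive in_values[i]
+                # streamed responses from square_int32, each containing
+                # in_values[i]; a request with an input value of 0 should
+                # receive no response at all.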
+ for i in range(len(in_values)): + this_id = str(i) + is_received = False + if this_id in result_dict.keys(): + is_received = True + + if in_values[i] != 0: + self.assertTrue( + is_received, + "response for request id {} not received".format(this_id), + ) + self.assertEqual(len(result_dict[this_id]), in_values[i]) + + result_list = result_dict[this_id] + expected_data = np.array([in_values[i]], dtype=np.int32) + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertTrue( + np.array_equal(expected_data, this_data), + "error: incorrect data: expected {}, got {}".format( + expected_data, this_data + ), + ) + else: + self.assertFalse( + is_received, + "received unexpected response for request id {}".format( + this_id + ), + ) + + def test_decoupled_return_response_error(self): + model_name = "decoupled_return_response_error" + shape = [16] + user_data = UserData() + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as client: + client.start_stream(callback=partial(callback, user_data)) + input_data_0 = np.random.random(shape).astype(np.float32) + input_data_1 = np.random.random(shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), + grpcclient.InferInput( + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), + ] + inputs[0].set_data_from_numpy(input_data_0) + inputs[1].set_data_from_numpy(input_data_1) + client.async_stream_infer(model_name=model_name, inputs=inputs) + data_item = user_data._completed_requests.get() + if type(data_item) == InferenceServerException: + self.assertIn( + "Python model 'decoupled_return_response_error_0_0' is using " + "the decoupled mode and the execute function must return " + "None.", + data_item.message(), + "Exception message didn't show up.", + ) + + def test_decoupled_send_after_close_error(self): + model_name = "decoupled_send_after_close_error" + shape = [16] + user_data = UserData() + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as client: + client.start_stream(callback=partial(callback, user_data)) + input_data_0 = np.random.random(shape).astype(np.float32) + input_data_1 = np.random.random(shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), + grpcclient.InferInput( + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), + ] + inputs[0].set_data_from_numpy(input_data_0) + inputs[1].set_data_from_numpy(input_data_1) + client.async_stream_infer(model_name=model_name, inputs=inputs) + + # Because the model has closed the response sender there is no + # way to deliver the error message to the client. The error + # will be logged on the server side. 
+ time.sleep(4) + self.assertEqual( + user_data._completed_requests.qsize(), + 0, + "The completed request size must be zero.", + ) + + def test_decoupled_execute_cancel(self): + model_name = "execute_cancel" + log_path = "decoupled_server.log" + execute_delay = 4.0 # seconds + shape = [1, 1] + user_data = UserData() + + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as client: + client.start_stream(callback=partial(callback, user_data)) + input_data = np.array([[execute_delay]], dtype=np.float32) + inputs = [ + grpcclient.InferInput( + "EXECUTE_DELAY", shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + client.async_stream_infer(model_name, inputs) + time.sleep(2) # model delay for decoupling request and response sender + time.sleep(2) # ensure the request is executing + client.stop_stream(cancel_requests=True) + time.sleep(2) # ensure the cancellation is delivered + + self.assertFalse(user_data._completed_requests.empty()) + while not user_data._completed_requests.empty(): + data_item = user_data._completed_requests.get() + self.assertIsInstance(data_item, InferenceServerException) + self.assertEqual(data_item.status(), "StatusCode.CANCELLED") + + with open(log_path, mode="r", encoding="utf-8", errors="strict") as f: + log_text = f.read() + self.assertIn("[execute_cancel] Request not cancelled at 1.0 s", log_text) + self.assertIn("[execute_cancel] Request cancelled at ", log_text) + + def test_decoupled_raise_exception(self): + # The decoupled_raise_exception model raises an exception for the request. + # This test case is making sure that repeated exceptions are properly handled. + + model_name = "decoupled_raise_exception" + shape = [2, 2] + number_of_requests = 10 + user_data = UserData() + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as triton_client: + triton_client.start_stream(callback=partial(callback, user_data)) + + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + + for i in range(number_of_requests): + result = user_data._completed_requests.get() + self.assertIs(type(result), InferenceServerException) + self.assertIn("Intentional Error", result.message()) + + self.assertTrue(triton_client.is_model_ready(model_name)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py new file mode 100644 index 0000000000..db6be9f908 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py @@ -0,0 +1,321 @@ +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import sys +import threading +import time + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +class TritonPythonModel: + """This model sends an error message with the first request.""" + + def initialize(self, args): + logger = pb_utils.Logger + logger.log("Initialize-Specific Msg!", logger.INFO) + logger.log_info("Initialize-Info Msg!") + logger.log_warn("Initialize-Warning Msg!") + logger.log_error("Initialize-Error Msg!") + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + # Get OUT configuration + out_config = pb_utils.get_output_config_by_name(model_config, "OUT") + + # Convert Triton types to numpy types + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) + + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + logger = pb_utils.Logger + logger.log("Initialize-Specific Msg!", logger.INFO) + logger.log_info("Initialize-Info Msg!") + logger.log_warn("Initialize-Warning Msg!") + logger.log_error("Initialize-Error Msg!") + + def execute(self, requests): + """This function is called on inference request.""" + logger = pb_utils.Logger + logger.log("Execute-Specific Msg!", logger.INFO) + logger.log_info("Execute-Info Msg!") + logger.log_warn("Execute-Warning Msg!") + logger.log_error("Execute-Error Msg!") + # Only generate the error for the first request + for i, request in enumerate(requests): + request_input = pb_utils.get_input_tensor_by_name(request, "IN") + + # Sync BLS request + infer_request = pb_utils.InferenceRequest( + model_name="identity_fp32", + requested_output_names=["OUTPUT0"], + inputs=[pb_utils.Tensor("INPUT0", request_input.as_numpy())], + ) + infer_response = infer_request.exec() + if infer_response.has_error(): + raise pb_utils.TritonModelException( + f"BLS Response has an error: {infer_response.error().message()}" + 
) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + if np.any(output0.as_numpy() != request_input.as_numpy()): + raise pb_utils.TritonModelException( + f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}" + ) + + thread1 = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) + thread1.daemon = True + with self.inflight_thread_count_lck: + self.inflight_thread_count += 1 + thread1.start() + + logger = pb_utils.Logger + logger.log("Execute-Specific Msg!", logger.INFO) + logger.log_info("Execute-Info Msg!") + logger.log_warn("Execute-Warning Msg!") + logger.log_error("Execute-Error Msg!") + + return None + + def _get_gpu_bls_outputs(self, input0_pb, input1_pb): + """ + This function is created to test that the DLPack container works + properly when the inference response and outputs go out of scope. + + Returns True on success and False on failure. + """ + logger = pb_utils.Logger + logger.log("_get_gpu_bls_outputs-Specific Msg!", logger.INFO) + logger.log_info("_get_gpu_bls_outputs-Info Msg!") + logger.log_warn("_get_gpu_bls_outputs-Warning Msg!") + logger.log_error("_get_gpu_bls_outputs-Error Msg!") + + infer_request = pb_utils.InferenceRequest( + model_name="dlpack_add_sub", + inputs=[input0_pb, input1_pb], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + infer_response = infer_request.exec() + if infer_response.has_error(): + return False + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") + if output0 is None or output1 is None: + return False + + # When one of the inputs is in GPU the output returned by the model must + # be in GPU, otherwise the outputs will be in CPU. + if not input0_pb.is_cpu() or not input1_pb.is_cpu(): + if output0.is_cpu() or output1.is_cpu(): + return False + else: + if (not output0.is_cpu()) or (not output1.is_cpu()): + return False + + # Make sure that the reference count is increased by one when DLPack + # representation is created. 
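+        # Note: sys.getrefcount() includes the temporary reference held by the
+        # call itself, so the checks below compare before/after deltas rather
+        # than absolute counts.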
+ rc_before_dlpack_output0 = sys.getrefcount(output0) + rc_before_dlpack_output1 = sys.getrefcount(output1) + + output0_dlpack = output0.to_dlpack() + output1_dlpack = output1.to_dlpack() + + rc_after_dlpack_output0 = sys.getrefcount(output0) + rc_after_dlpack_output1 = sys.getrefcount(output1) + + if rc_after_dlpack_output0 - rc_before_dlpack_output0 != 1: + return False + + if rc_after_dlpack_output1 - rc_before_dlpack_output1 != 1: + return False + + # Make sure that reference count decreases after destroying the DLPack + output0_dlpack = None + output1_dlpack = None + rc_after_del_dlpack_output0 = sys.getrefcount(output0) + rc_after_del_dlpack_output1 = sys.getrefcount(output1) + if rc_after_del_dlpack_output0 - rc_after_dlpack_output0 != -1: + return False + + if rc_after_del_dlpack_output1 - rc_after_dlpack_output1 != -1: + return False + + return output0.to_dlpack(), output1.to_dlpack() + + def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu): + logger = pb_utils.Logger + logger.log("_test_gpu_bls_add_sub-Specific Msg!", logger.INFO) + logger.log_info("_test_gpu_bls_add_sub-Info Msg!") + logger.log_warn("_test_gpu_bls_add_sub-Warning Msg!") + logger.log_error("_test_gpu_bls_add_sub-Error Msg!") + + input0 = torch.rand(16) + input1 = torch.rand(16) + + if is_input0_gpu: + input0 = input0.to("cuda") + + if is_input1_gpu: + input1 = input1.to("cuda") + + input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + input1_pb = pb_utils.Tensor.from_dlpack("INPUT1", to_dlpack(input1)) + gpu_bls_return = self._get_gpu_bls_outputs(input0_pb, input1_pb) + if gpu_bls_return: + output0_dlpack, output1_dlpack = gpu_bls_return + else: + return False + + expected_output_0 = from_dlpack(input0_pb.to_dlpack()).to("cpu") + from_dlpack( + input1_pb.to_dlpack() + ).to("cpu") + expected_output_1 = from_dlpack(input0_pb.to_dlpack()).to("cpu") - from_dlpack( + input1_pb.to_dlpack() + ).to("cpu") + + output0_matches = torch.all( + expected_output_0 == from_dlpack(output0_dlpack).to("cpu") + ) + output1_matches = torch.all( + expected_output_1 == from_dlpack(output1_dlpack).to("cpu") + ) + if not output0_matches or not output1_matches: + return False + + return True + + def execute_gpu_bls(self): + logger = pb_utils.Logger + logger.log("execute_gpu_bls-Specific Msg!", logger.INFO) + logger.log_info("execute_gpu_bls-Info Msg!") + logger.log_warn("execute_gpu_bls-Warning Msg!") + logger.log_error("execute_gpu_bls-Error Msg!") + for input0_device in [True, False]: + for input1_device in [True, False]: + test_status = self._test_gpu_bls_add_sub(input0_device, input1_device) + if not test_status: + return False + + return True + + def response_thread(self, response_sender, in_input): + # The response_sender is used to send response(s) associated with the + # corresponding request. + # Sleep 5 seconds to make sure the main thread has exited. 
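+        # In decoupled mode the response sender stays valid after execute()
+        # returns, so the delay below deliberately exercises the case where
+        # responses are produced after the main execute() call has finished.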
+ logger = pb_utils.Logger + logger.log("response_thread-Specific Msg!", logger.INFO) + logger.log_info("response_thread-Info Msg!") + logger.log_warn("response_thread-Warning Msg!") + logger.log_error("response_thread-Error Msg!") + time.sleep(5) + + # FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU + if sys.platform != "win32": + status = self.execute_gpu_bls() + else: + status = True + + if not status: + infer_response = pb_utils.InferenceResponse(error="GPU BLS test failed.") + response_sender.send(infer_response) + else: + in_value = in_input + infer_request = pb_utils.InferenceRequest( + model_name="identity_fp32", + requested_output_names=["OUTPUT0"], + inputs=[pb_utils.Tensor("INPUT0", in_input)], + ) + infer_response = infer_request.exec() + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + if infer_response.has_error(): + response = pb_utils.InferenceResponse( + error=infer_response.error().message() + ) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + elif np.any(in_input != output0.as_numpy()): + error_message = ( + "BLS Request input and BLS response output do not match." + f" {in_value} != {output0.as_numpy()}" + ) + response = pb_utils.InferenceResponse(error=error_message) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + output_tensors = [pb_utils.Tensor("OUT", in_value)] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + with self.inflight_thread_count_lck: + self.inflight_thread_count -= 1 + logger.log("response_thread-Specific Msg!", logger.INFO) + logger.log_info("response_thread-Info Msg!") + logger.log_warn("response_thread-Warning Msg!") + logger.log_error("response_thread-Error Msg!") + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + logger = pb_utils.Logger + logger.log_info("Finalize invoked") + + inflight_threads = True + while inflight_threads: + with self.inflight_thread_count_lck: + inflight_threads = self.inflight_thread_count != 0 + if inflight_threads: + time.sleep(0.1) + + logger.log_info("Finalize complete...") diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls/config.pbtxt b/qa/L0_backend_python/decoupled/models/decoupled_bls/config.pbtxt new file mode 100644 index 0000000000..aaefde24a5 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "decoupled_bls" +backend: "python" +max_batch_size: 64 + +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py new file mode 100644 index 0000000000..8643482912 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py @@ -0,0 +1,132 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import threading +import time + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model sends a BLS request to a decoupled model 'square_int32' and + returns the output from 'square_int32' as responses. + """ + + def initialize(self, args): + # You must parse model_config. 
JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + + def execute(self, requests): + """This function is called on inference request.""" + + for request in requests: + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) + thread.daemon = True + with self.inflight_thread_count_lck: + self.inflight_thread_count += 1 + thread.start() + + return None + + def response_thread(self, response_sender, in_value): + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", + requested_output_names=["OUT"], + inputs=[pb_utils.Tensor("IN", in_value)], + ) + infer_responses = infer_request.exec(decoupled=True) + + response_count = 0 + for infer_response in infer_responses: + if len(infer_response.output_tensors()) > 0: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + if infer_response.has_error(): + response = pb_utils.InferenceResponse( + error=infer_response.error().message() + ) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + elif np.any(in_value != output0.as_numpy()): + error_message = ( + "BLS Request input and BLS response output do not match." + f" {in_value} != {output0.as_numpy()}" + ) + response = pb_utils.InferenceResponse(error=error_message) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + output_tensors = [pb_utils.Tensor("OUT", output0.as_numpy())] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) + response_sender.send(response) + + response_count += 1 + + if in_value != response_count - 1: + error_message = "Expected {} responses, got {}".format( + in_value, len(infer_responses) - 1 + ) + response = pb_utils.InferenceResponse(error=error_message) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + with self.inflight_thread_count_lck: + self.inflight_thread_count -= 1 + + def finalize(self): + inflight_threads = True + while inflight_threads: + with self.inflight_thread_count_lck: + inflight_threads = self.inflight_thread_count != 0 + if inflight_threads: + time.sleep(0.1) diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/config.pbtxt b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/config.pbtxt new file mode 100644 index 0000000000..23ad453212 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/config.pbtxt @@ -0,0 +1,54 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "decoupled_bls_stream" +backend: "python" + +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py new file mode 100644 index 0000000000..3882f0da9c --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py @@ -0,0 +1,125 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
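+
+# Used by decoupled_test.py::test_decoupled_execute_error: the first request
+# in a batch is answered with an error response while the remaining requests
+# are answered normally, verifying that one failing request does not fail the
+# whole batch.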
+ +import json +import threading +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model sends an error message with the first request.""" + + def initialize(self, args): + # You must parse model_config. JSON string is not parsed here + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + # Get OUT configuration + out_config = pb_utils.get_output_config_by_name(model_config, "OUT") + + # Convert Triton types to numpy types + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) + + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + + def execute(self, requests): + """This function is called on inference request.""" + + # Only generate the error for the first request + for i, request in enumerate(requests): + # Start a separate thread to send the responses for the request. + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + i, + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) + thread.daemon = True + + with self.inflight_thread_count_lck: + self.inflight_thread_count += 1 + + thread.start() + + return None + + def response_thread(self, response_sender, index, in_input): + # The response_sender is used to send response(s) associated with the + # corresponding request. The first request will send errors and the + # other requests will send the responses. The number of responses per + # requests is the number of elements in input tensor. + + in_value = in_input + out_output = pb_utils.Tensor("OUT", in_value) + + if index == 0: + error = pb_utils.TritonError("An error occurred during execution") + response = pb_utils.InferenceResponse( + output_tensors=[out_output], error=error + ) + else: + response = pb_utils.InferenceResponse(output_tensors=[out_output]) + response_sender.send(response) + + # We must close the response sender to indicate to Triton that we are + # done sending responses for the corresponding request. We can't use the + # response sender after closing it. + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + with self.inflight_thread_count_lck: + self.inflight_thread_count -= 1 + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is OPTIONAL. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Finalize invoked") + + inflight_threads = True + while inflight_threads: + with self.inflight_thread_count_lck: + inflight_threads = self.inflight_thread_count != 0 + if inflight_threads: + time.sleep(0.1) + + print("Finalize complete...") diff --git a/qa/L0_backend_python/decoupled/models/decoupled_execute_error/config.pbtxt b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/config.pbtxt new file mode 100644 index 0000000000..37e62d4adb --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/config.pbtxt @@ -0,0 +1,57 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "decoupled_execute_error" +backend: "python" +max_batch_size: 64 + +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + +dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 } diff --git a/qa/L0_backend_python/decoupled/models/decoupled_raise_exception/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_raise_exception/1/model.py new file mode 100644 index 0000000000..03a19db98d --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_raise_exception/1/model.py @@ -0,0 +1,35 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + def initialize(self, args): + pass + + def execute(self, requests): + for request in requests: + raise Exception("Intentional Error") + return None diff --git a/qa/L0_backend_python/decoupled/models/decoupled_raise_exception/config.pbtxt b/qa/L0_backend_python/decoupled/models/decoupled_raise_exception/config.pbtxt new file mode 100644 index 0000000000..046687dfe7 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_raise_exception/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "decoupled_raise_exception" +backend: "python" +max_batch_size: 64 + +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py new file mode 100644 index 0000000000..ecde9c7168 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py @@ -0,0 +1,82 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model tries to return a response directly from + execute function when configured as decoupled model. + """ + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """Tries to create a response sender object and use that + for sending the response. 
+ """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + return responses diff --git a/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/config.pbtxt b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/config.pbtxt new file mode 100644 index 0000000000..5d35d05ad6 --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "decoupled_return_response_error" +backend: "python" +model_transaction_policy { + decoupled: True +} +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py new file mode 100644 index 0000000000..52aa17ac0d --- /dev/null +++ b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model tries to send response after closing + the response_sender. + """ + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """Create a response sender object and use that + for sending the response. + """ + + # This model does not support batching, so 'request_count' should always be 1. 
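+        # The FINAL flag is sent first and the response afterwards: the second
+        # send() is expected to fail because the response sender is already
+        # closed, and the resulting error is only visible in the server log
+        # (see decoupled_test.py::test_decoupled_send_after_close_error).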
+        if len(requests) != 1:
+            raise pb_utils.TritonModelException(
+                "unsupported batch size " + str(len(requests))
+            )
+
+        output0_dtype = self.output0_dtype
+        output1_dtype = self.output1_dtype
+
+        response_sender = requests[0].get_response_sender()
+        in_0 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT0")
+        in_1 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT1")
+        out_0, out_1 = (
+            in_0.as_numpy() + in_1.as_numpy(),
+            in_0.as_numpy() - in_1.as_numpy(),
+        )
+
+        out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype))
+        out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype))
+        response = pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])
+
+        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+        response_sender.send(response)
diff --git a/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/config.pbtxt b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/config.pbtxt
new file mode 100644
index 0000000000..3c9443a6f0
--- /dev/null
+++ b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/config.pbtxt
@@ -0,0 +1,62 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "decoupled_send_after_close_error"
+backend: "python"
+model_transaction_policy {
+  decoupled: True
+}
+
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+input [
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT1"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+
+instance_group [{ kind: KIND_CPU }]
diff --git a/qa/L0_backend_python/decoupled/test.sh b/qa/L0_backend_python/decoupled/test.sh
new file mode 100755
index 0000000000..86455ff897
--- /dev/null
+++ b/qa/L0_backend_python/decoupled/test.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +CLIENT_PY=./decoupled_test.py +CLIENT_LOG="./decoupled_client.log" +TEST_RESULT_FILE='test_results.txt' +SERVER_ARGS="--model-repository=${MODELDIR}/decoupled/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./decoupled_server.log" + +pip3 uninstall -y torch +# FIXME: Until Windows supports GPU tensors, only test CPU scenarios +if [[ ${TEST_WINDOWS} == 1 ]]; then + pip3 install torch==1.13.0 -f https://download.pytorch.org/whl/torch_stable.html +else + pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html +fi + +RET=0 +source ../../common/util.sh + +rm -fr *.log +mkdir -p models/identity_fp32/1/ +cp ../../python_models/identity_fp32/model.py models/identity_fp32/1/ +cp ../../python_models/identity_fp32/config.pbtxt models/identity_fp32/ + +mkdir -p models/execute_cancel/1/ +cp ../../python_models/execute_cancel/model.py ./models/execute_cancel/1/ +cp ../../python_models/execute_cancel/config.pbtxt ./models/execute_cancel/ +echo "model_transaction_policy { decoupled: True }" >> ./models/execute_cancel/config.pbtxt + +rm -fr python_backend +git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG +mkdir -p models/square_int32/1/ +cp python_backend/examples/decoupled/square_model.py models/square_int32/1/model.py +cp python_backend/examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt + +mkdir -p models/dlpack_add_sub/1/ +cp ../../python_models/dlpack_add_sub/model.py models/dlpack_add_sub/1/ +cp ../../python_models/dlpack_add_sub/config.pbtxt models/dlpack_add_sub/ + +function verify_log_counts () { + if [ `grep -c "Specific Msg!" $SERVER_LOG` -lt 1 ]; then + echo -e "\n***\n*** Test Failed: Specific Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Info Msg!" $SERVER_LOG` -lt 1 ]; then + echo -e "\n***\n*** Test Failed: Info Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Warning Msg!" 
$SERVER_LOG` -lt 1 ]; then + echo -e "\n***\n*** Test Failed: Warning Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Error Msg!" $SERVER_LOG` -lt 1 ]; then + echo -e "\n***\n*** Test Failed: Error Msg Count Incorrect\n***" + RET=1 + fi + # NOTE: Windows does not seem to have a way to send a true SIGINT signal + # to tritonserver. Instead, it seems required to use taskkill.exe with /F (force) + # to kill the running program. This means the server terminates immediately, + # instead of shutting down how it would if Ctrl^C was invoked from the terminal. + # To properly test functionality, we need a WAR. + if [[ ${TEST_WINDOWS} == 0 ]]; then + if [ `grep -c "Finalize invoked" $SERVER_LOG` -ne 3 ]; then + echo -e "\n***\n*** Test Failed: 'Finalize invoked' message missing\n***" + RET=1 + fi + if [ `grep -c "Finalize complete..." $SERVER_LOG` -ne 3 ]; then + echo -e "\n***\n*** Test Failed: 'Finalize complete...' message missing\n***" + RET=1 + fi + fi +} + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 -m pytest --junitxml=decoupled.report.xml $CLIENT_PY > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** decoupled test FAILED. \n***" + RET=1 +fi +set -e + +kill_server + +verify_log_counts + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Decoupled test FAILED. \n***" +else + echo -e "\n***\n*** Decoupled test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/ensemble/ensemble_test.py b/qa/L0_backend_python/ensemble/ensemble_test.py new file mode 100755 index 0000000000..521f59900f --- /dev/null +++ b/qa/L0_backend_python/ensemble/ensemble_test.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
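+# The assertions below assume the "ensemble" and "ensemble_gpu" configurations
+# chain two add_sub models: for inputs (a, b) the first stage produces
+# (a+b, a-b) and the second stage turns that into (a+b)+(a-b) = 2a and
+# (a+b)-(a-b) = 2b, which is why each output is compared against 2x its input.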
+ +import os +import sys + +sys.path.append("../../common") + +import unittest + +import numpy as np +import shm_util +import tritonclient.http as httpclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class EnsembleTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def infer(self, model_name): + shape = [16] + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data_0 = np.random.random(shape).astype(np.float32) + input_data_1 = np.random.random(shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), + httpclient.InferInput( + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), + ] + inputs[0].set_data_from_numpy(input_data_0) + inputs[1].set_data_from_numpy(input_data_1) + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + output1 = result.as_numpy("OUTPUT1") + self.assertIsNotNone(output0) + self.assertIsNotNone(output1) + + # Set a big enough tolerance to reduce intermittence. May be + # better to test integer outputs in the future for consistency. + self.assertTrue(np.allclose(output0, 2 * input_data_0, atol=1e-06)) + self.assertTrue(np.allclose(output1, 2 * input_data_1, atol=1e-06)) + + def test_ensemble(self): + model_name = "ensemble" + self.infer(model_name) + + def test_ensemble_gpu(self): + model_name = "ensemble_gpu" + self.infer(model_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/ensemble/test.sh b/qa/L0_backend_python/ensemble/test.sh new file mode 100755 index 0000000000..3df9071a03 --- /dev/null +++ b/qa/L0_backend_python/ensemble/test.sh @@ -0,0 +1,115 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="./ensemble_client.log" +source ../common.sh +source ../../common/util.sh + +# FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU scenarios +if [[ ${TEST_WINDOWS} == 1 ]]; then + EXPECTED_NUM_TESTS="1" +else + EXPECTED_NUM_TESTS="2" +fi + +SERVER_ARGS="--model-repository=${MODELDIR}/ensemble/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./ensemble_server.log" + +RET=0 +rm -rf models/ $CLIENT_LOG + +# Ensemble Model +mkdir -p models/ensemble/1/ +cp ../../python_models/ensemble/config.pbtxt ./models/ensemble + +mkdir -p models/add_sub_1/1/ +cp ../../python_models/add_sub/config.pbtxt ./models/add_sub_1 +cp ../../python_models/add_sub/model.py ./models/add_sub_1/1/ + +mkdir -p models/add_sub_2/1/ +cp ../../python_models/add_sub/config.pbtxt ./models/add_sub_2/ +cp ../../python_models/add_sub/model.py ./models/add_sub_2/1/ + +# FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU scenarios +if [[ ${TEST_WINDOWS} == 0 ]]; then + # Ensemble GPU Model + mkdir -p models/ensemble_gpu/1/ + cp ../../python_models/ensemble_gpu/config.pbtxt ./models/ensemble_gpu + cp -r ${DATADIR}/qa_model_repository/libtorch_float32_float32_float32/ ./models + (cd models/libtorch_float32_float32_float32 && \ + echo "instance_group [ { kind: KIND_GPU }]" >> config.pbtxt) + (cd models/libtorch_float32_float32_float32 && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 0/" config.pbtxt) + (cd models/libtorch_float32_float32_float32 && \ + sed -i "s/^version_policy:.*//" config.pbtxt) + rm -rf models/libtorch_float32_float32_float32/2 + rm -rf models/libtorch_float32_float32_float32/3 +fi + +prev_num_pages=`get_shm_pages` + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e + +# FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU scenarios +if [[ ${TEST_WINDOWS} == 0 ]]; then + python3 -m pytest --junitxml=ensemble.report.xml ensemble_test.py 2>&1 > $CLIENT_LOG +else + python3 ensemble_test.py EnsembleTest.test_ensemble 2>&1 > $CLIENT_LOG +fi + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** ensemble_test.py FAILED. \n***" + RET=1 +fi +set -e + +kill_server + +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages where not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + RET=1 +fi + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Ensemble test FAILED. \n***" +else + echo -e "\n***\n*** Ensemble test PASSED. 
\n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/env/test.sh b/qa/L0_backend_python/env/test.sh new file mode 100755 index 0000000000..ff9e368e75 --- /dev/null +++ b/qa/L0_backend_python/env/test.sh @@ -0,0 +1,319 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="./env_client.log" +source ../common.sh +source ../../common/util.sh + +BASE_SERVER_ARGS="--model-repository=${MODELDIR}/env/models --log-verbose=1 --disable-auto-complete-config" +PYTHON_BACKEND_BRANCH=$PYTHON_BACKEND_REPO_TAG +SERVER_ARGS=$BASE_SERVER_ARGS +SERVER_LOG="./env_server.log" + +RET=0 + +rm -fr ./models +rm -rf *.tar.gz +install_build_deps +install_conda + +# Tensorflow 2.1.0 only works with Python 3.4 - 3.7. Successful execution of +# the Python model indicates that the environment has been setup correctly. +# Create a model with python 3.7 version +create_conda_env "3.7" "python-3-7" +conda install numpy=1.20.1 -y +conda install tensorflow=2.1.0 -y +conda install -c conda-forge libstdcxx-ng=12 -y + +PY37_VERSION_STRING="Python version is 3.7, NumPy version is 1.20.1, and Tensorflow version is 2.1.0" +create_python_backend_stub +conda-pack -o python3.7.tar.gz +path_to_conda_pack=`pwd`/python3.7.tar.gz +mkdir -p models/python_3_7/1/ +cp ../../python_models/python_version/config.pbtxt ./models/python_3_7 +(cd models/python_3_7 && \ + sed -i "s/^name:.*/name: \"python_3_7\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}">> config.pbtxt) +cp ../../python_models/python_version/model.py ./models/python_3_7/1/ +cp python_backend/builddir/triton_python_backend_stub ./models/python_3_7 +conda deactivate + +# Use python-3-7 without conda pack +# Create a model with python 3.7 version and numpy 1.20.3 to distinguish from +# previous test. +# Tensorflow 2.1.0 only works with Python 3.4 - 3.7. Successful execution of +# the Python model indicates that the environment has been setup correctly. 
+path_to_conda_pack="$PWD/python-3-7-1" +create_conda_env_with_specified_path "3.7" $path_to_conda_pack +conda install numpy=1.20.3 -y +conda install tensorflow=2.1.0 -y +conda install -c conda-forge libstdcxx-ng=12 -y + +PY37_1_VERSION_STRING="Python version is 3.7, NumPy version is 1.20.3, and Tensorflow version is 2.1.0" +create_python_backend_stub +mkdir -p models/python_3_7_1/1/ +cp ../../python_models/python_version/config.pbtxt ./models/python_3_7_1 +(cd models/python_3_7_1 && \ + sed -i "s/^name:.*/name: \"python_3_7_1\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}">> config.pbtxt) +cp ../../python_models/python_version/model.py ./models/python_3_7_1/1/ +# Copy activate script to folder +cp $path_to_conda_pack/lib/python3.7/site-packages/conda_pack/scripts/posix/activate $path_to_conda_pack/bin/. +cp python_backend/builddir/triton_python_backend_stub ./models/python_3_7_1 +conda deactivate + +# Create a model with python 3.6 version +# Tensorflow 2.1.0 only works with Python 3.4 - 3.7. Successful execution of +# the Python model indicates that the environment has been setup correctly. +create_conda_env "3.6" "python-3-6" +conda install -c conda-forge libstdcxx-ng=12 -y +conda install numpy=1.18.1 -y +conda install tensorflow=2.1.0 -y +PY36_VERSION_STRING="Python version is 3.6, NumPy version is 1.18.1, and Tensorflow version is 2.1.0" +conda-pack -o python3.6.tar.gz + +# Test relative execution env path +path_to_conda_pack='$$TRITON_MODEL_DIRECTORY/python_3_6_environment.tar.gz' +create_python_backend_stub +mkdir -p models/python_3_6/1/ +cp ../../python_models/python_version/config.pbtxt ./models/python_3_6 +cp python3.6.tar.gz models/python_3_6/python_3_6_environment.tar.gz +(cd models/python_3_6 && \ + sed -i "s/^name:.*/name: \"python_3_6\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}" >> config.pbtxt) +cp ../../python_models/python_version/model.py ./models/python_3_6/1/ +cp python_backend/builddir/triton_python_backend_stub ./models/python_3_6 +conda deactivate + +# Test conda env without custom Python backend stub This environment should +# always use the default Python version shipped in the container. 
For Ubuntu 22.04 +# it is Python 3.10 and for Ubuntu 20.04 is 3.8 +path_to_conda_pack='$$TRITON_MODEL_DIRECTORY/python_3_10_environment.tar.gz' +create_conda_env "3.10" "python-3-10" +conda install -c conda-forge libstdcxx-ng=12 -y +conda install numpy=1.23.4 -y +conda install tensorflow=2.10.0 -y +PY310_VERSION_STRING="Python version is 3.10, NumPy version is 1.23.4, and Tensorflow version is 2.10.0" +conda pack -o python3.10.tar.gz +mkdir -p models/python_3_10/1/ +cp ../../python_models/python_version/config.pbtxt ./models/python_3_10 +cp python3.10.tar.gz models/python_3_10/python_3_10_environment.tar.gz +(cd models/python_3_10 && \ + sed -i "s/^name:.*/name: \"python_3_10\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}" >> config.pbtxt) +cp ../../python_models/python_version/model.py ./models/python_3_10/1/ +conda deactivate +rm -rf ./miniconda + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +set +e +for EXPECTED_VERSION_STRING in "$PY36_VERSION_STRING" "$PY37_VERSION_STRING" "$PY37_1_VERSION_STRING" "$PY310_VERSION_STRING"; do + grep "$EXPECTED_VERSION_STRING" $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** $EXPECTED_VERSION_STRING was not found in Triton logs. \n***" + RET=1 + fi +done + +# Test default (non set) locale in python stub processes +# NOTE: In certain pybind versions, the locale settings may not be propagated from parent to +# stub processes correctly. See https://github.com/triton-inference-server/python_backend/pull/260. +export LC_ALL=INVALID +grep "Locale is (None, None)" $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Default unset Locale was not found in Triton logs. \n***" + RET=1 + fi +set -e + +rm $SERVER_LOG + +# Test locale set via environment variable in python stub processes +# NOTE: In certain pybind versions, the locale settings may not be propagated from parent to +# stub processes correctly. See https://github.com/triton-inference-server/python_backend/pull/260. +export LC_ALL=C.UTF-8 +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +set +e +grep "Locale is ('en_US', 'UTF-8')" $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Locale UTF-8 was not found in Triton logs. \n***" + RET=1 + fi +set -e + +rm $SERVER_LOG + +## Test re-extraction of environment. 
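+# The packed environment should only be re-extracted when the archive itself
+# changes: re-loading after touching model.py must reuse the existing
+# extraction, while touching the tar.gz forces a fresh one, so the log is
+# checked for exactly two "Extracting Python execution env" messages below.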
+SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1 --model-control-mode=explicit" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# The environment should be extracted +curl -v -X POST localhost:8000/v2/repository/models/python_3_10/load +touch -m models/python_3_10/1/model.py +# The environment should not be re-extracted +curl -v -X POST localhost:8000/v2/repository/models/python_3_10/load +touch -m models/python_3_10/python_3_10_environment.tar.gz +# The environment should be re-extracted +curl -v -X POST localhost:8000/v2/repository/models/python_3_10/load + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +PY310_ENV_EXTRACTION="Extracting Python execution env" +if [ `grep -c "${PY310_ENV_EXTRACTION}" ${SERVER_LOG}` != "2" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Python execution environment should be extracted exactly twice. \n***" + RET=1 +fi +set -e + +# Test execution environments with S3 +# S3 credentials are necessary for this test. Pass via ENV variables +aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + +# S3 bucket path (Point to bucket when testing cloud storage) +BUCKET_URL="s3://triton-bucket-${CI_JOB_ID}" + +# Cleanup and delete S3 test bucket if it already exists (due to test failure) +aws s3 rm $BUCKET_URL --recursive --include "*" && \ + aws s3 rb $BUCKET_URL || true + +# Make S3 test bucket +aws s3 mb "${BUCKET_URL}" + +# Remove Slash in BUCKET_URL +BUCKET_URL=${BUCKET_URL%/} +BUCKET_URL_SLASH="${BUCKET_URL}/" + +# Remove Python 3.7 model because it contains absolute paths and cannot be used +# with S3. +rm -rf models/python_3_7 + +# Test with the bucket url as model repository +aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*" + +rm $SERVER_LOG + +SERVER_ARGS="--model-repository=$BUCKET_URL_SLASH --log-verbose=1" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + aws s3 rb "${BUCKET_URL}" --force || true + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +set +e +grep "$PY36_VERSION_STRING" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** $PY36_VERSION_STRING was not found in Triton logs. \n***" + RET=1 +fi +set -e + +# Clean up bucket contents +aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + +# Test with EXECUTION_ENV_PATH outside the model directory +sed -i "s/TRITON_MODEL_DIRECTORY\/python_3_6_environment/TRITON_MODEL_DIRECTORY\/..\/python_3_6_environment/" models/python_3_6/config.pbtxt +mv models/python_3_6/python_3_6_environment.tar.gz models +sed -i "s/\$\$TRITON_MODEL_DIRECTORY\/python_3_10_environment/s3:\/\/triton-bucket-${CI_JOB_ID}\/python_3_10_environment/" models/python_3_10/config.pbtxt +mv models/python_3_10/python_3_10_environment.tar.gz models + +aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*" + +rm $SERVER_LOG + +SERVER_ARGS="--model-repository=$BUCKET_URL_SLASH --log-verbose=1" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + aws s3 rb "${BUCKET_URL}" --force || true + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +set +e +for EXPECTED_VERSION_STRING in "$PY36_VERSION_STRING" "$PY310_VERSION_STRING"; do + grep "$EXPECTED_VERSION_STRING" $SERVER_LOG + if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** $EXPECTED_VERSION_STRING was not found in Triton logs. \n***" + RET=1 + fi +done +set -e + +# Clean up bucket contents and delete bucket +aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" +aws s3 rb "${BUCKET_URL}" + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Env Manager Test PASSED.\n***" +else + cat $SERVER_LOG + echo -e "\n***\n*** Env Manager Test FAILED.\n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/examples/test.sh b/qa/L0_backend_python/examples/test.sh new file mode 100755 index 0000000000..998e6c2038 --- /dev/null +++ b/qa/L0_backend_python/examples/test.sh @@ -0,0 +1,443 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +source ../common.sh +source ../../common/util.sh + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} + +SERVER_ARGS="--model-repository=${MODELDIR}/examples/python_backend/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./examples_server.log" + +RET=0 +rm -fr *.log python_backend/ + +# Install torch +pip3 uninstall -y torch +if [ "$TEST_JETSON" == "0" ] && [[ ${TEST_WINDOWS} == 0 ]]; then + pip3 install torch==2.0.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html torchvision==0.15.0+cu117 +else + pip3 install torch==2.0.0 -f https://download.pytorch.org/whl/torch_stable.html torchvision==0.15.0 +fi + +# Install `validators` for Model Instance Kind example +pip3 install validators + +# Install JAX +# Jax has dropped the support for Python 3.8. 
See https://jax.readthedocs.io/en/latest/changelog.html +if [ "$TEST_JETSON" == "0" ] && [ ${PYTHON_ENV_VERSION} != "8" ]; then + pip3 install --upgrade "jax[cuda12_local]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +fi + +git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG +cd python_backend + +# Example 1 +CLIENT_LOG="../examples_add_sub_client.log" +mkdir -p models/add_sub/1/ +cp examples/add_sub/model.py models/add_sub/1/model.py +cp examples/add_sub/config.pbtxt models/add_sub/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/add_sub/client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify add_sub example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify add_sub example. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Example 2 +CLIENT_LOG="../examples_pytorch_client.log" +mkdir -p models/pytorch/1/ +cp examples/pytorch/model.py models/pytorch/1/model.py +cp examples/pytorch/config.pbtxt models/pytorch/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/pytorch/client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify pytorch example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify pytorch example. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Example 3 + +# JAX AddSub +# JAX is not supported on Jetson +# Jax has dropped the support for Python 3.8. See https://jax.readthedocs.io/en/latest/changelog.html +if [ "$TEST_JETSON" == "0" ] && [ ${PYTHON_ENV_VERSION} != "8" ]; then + CLIENT_LOG="../examples_jax_client.log" + mkdir -p models/jax/1/ + cp examples/jax/model.py models/jax/1/model.py + cp examples/jax/config.pbtxt models/jax/config.pbtxt + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + fi + + set +e + python3 examples/jax/client.py > $CLIENT_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify jax example. \n***" + RET=1 + fi + + grep "PASS" $CLIENT_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify jax example. \n***" + cat $CLIENT_LOG + RET=1 + fi + set -e + + kill_server +fi + +# Example 4 + +# BLS Sync +CLIENT_LOG="../examples_sync_client.log" +mkdir -p models/bls_sync/1 +cp examples/bls/sync_model.py models/bls_sync/1/model.py +cp examples/bls/sync_config.pbtxt models/bls_sync/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/bls/sync_client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS sync example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS sync example. 
\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Example 5 + +# Decoupled Repeat +CLIENT_LOG="../examples_repeat_client.log" +mkdir -p models/repeat_int32/1/ +cp examples/decoupled/repeat_model.py models/repeat_int32/1/model.py +cp examples/decoupled/repeat_config.pbtxt models/repeat_int32/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/decoupled/repeat_client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify repeat_int32 example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify repeat_int32 example. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Example 6 + +# Decoupled Square +CLIENT_LOG="../examples_square_client.log" +mkdir -p models/square_int32/1/ +cp examples/decoupled/square_model.py models/square_int32/1/model.py +cp examples/decoupled/square_config.pbtxt models/square_int32/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/decoupled/square_client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify square_int32 example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify square_int32 example. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# +# BLS Async +# +# Skip async BLS on Jetson since it is not supported with python3.6 +# Having multiple python versions lead to build issues. +# Anaconda is not officially supported on Jetson. +if [ "$TEST_JETSON" == "0" ]; then + CLIENT_LOG="../examples_async_client.log" + mkdir -p models/bls_async/1 + cp examples/bls/async_model.py models/bls_async/1/model.py + cp examples/bls/async_config.pbtxt models/bls_async/config.pbtxt + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + fi + + set +e + python3 examples/bls/async_client.py > $CLIENT_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS async example. \n***" + RET=1 + fi + + grep "PASS" $CLIENT_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS async example. \n***" + cat $CLIENT_LOG + RET=1 + fi + + set -e + + kill_server +fi + +# Auto Complete Model Configuration Example +CLIENT_LOG="../examples_auto_complete_client.log" +mkdir -p models/nobatch_auto_complete/1/ +mkdir -p models/batch_auto_complete/1/ +cp examples/auto_complete/nobatch_model.py models/nobatch_auto_complete/1/model.py +cp examples/auto_complete/batch_model.py models/batch_auto_complete/1/model.py + +SERVER_ARGS="$SERVER_ARGS --strict-model-config=false" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/auto_complete/client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify auto_complete example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify auto_complete example. 
\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# BLS Decoupled Sync +CLIENT_LOG="../examples_bls_decoupled_sync_client.log" +mkdir -p models/bls_decoupled_sync/1 +cp examples/bls_decoupled/sync_model.py models/bls_decoupled_sync/1/model.py +cp examples/bls_decoupled/sync_config.pbtxt models/bls_decoupled_sync/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/bls_decoupled/sync_client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS Decoupled Sync example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS Decoupled Sync example. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# BLS Decoupled Async +if [ "$TEST_JETSON" == "0" ]; then + CLIENT_LOG="../examples_bls_decoupled_async_client.log" + mkdir -p models/bls_decoupled_async/1 + cp examples/bls_decoupled/async_model.py models/bls_decoupled_async/1/model.py + cp examples/bls_decoupled/async_config.pbtxt models/bls_decoupled_async/config.pbtxt + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + fi + + set +e + python3 examples/bls_decoupled/async_client.py > $CLIENT_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS Decoupled Async example. \n***" + RET=1 + fi + + grep "PASS" $CLIENT_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify BLS Decoupled Async example. \n***" + cat $CLIENT_LOG + RET=1 + fi + + set -e + + kill_server +fi + +# Example 7 + +# Model Instance Kind +CLIENT_LOG="../examples_model_instance_kind.log" +mkdir -p models/resnet50/1 +cp examples/instance_kind/model.py models/resnet50/1/ +cp examples/instance_kind/config.pbtxt models/resnet50/ +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/instance_kind/client.py --label_file examples/instance_kind/resnet50_labels.txt > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify Model instance Kind example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify Model Instance Kind example. Example failed to pass. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Custom Metrics +CLIENT_LOG="../examples_custom_metrics_client.log" +mkdir -p models/custom_metrics/1 +cp examples/custom_metrics/model.py models/custom_metrics/1/model.py +cp examples/custom_metrics/config.pbtxt models/custom_metrics/config.pbtxt +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +python3 examples/custom_metrics/client.py > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify Custom Metrics example. \n***" + RET=1 +fi + +grep "PASS" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify Custom Metrics example. 
\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Example verification test PASSED.\n***" +else + echo -e "\n***\n*** Example verification test FAILED.\n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/io/io_test.py b/qa/L0_backend_python/io/io_test.py new file mode 100755 index 0000000000..a047d3aa6a --- /dev/null +++ b/qa/L0_backend_python/io/io_test.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +sys.path.append("../../common") + +import itertools +import queue +import unittest +from functools import partial + +import numpy as np +import shm_util +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +TRIAL = os.getenv("TRIAL") + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class IOTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + self._client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + + def _run_ensemble_test(self, model_name): + user_data = UserData() + input0 = np.random.random([1000]).astype(np.float32) + # Use context manager to close client stream if any early exit occurs + with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: + client.start_stream(callback=partial(callback, user_data)) + # Each pair represents whether the corresponding model is in GPU or not. 
+ gpu_flags = [(True, False), (True, False), (True, False)] + # Create iterable of all possible combinations of each model gpu location + # ex: (True, True, True), (True, True, False), (True, False, True), ... + combinations = itertools.product(*gpu_flags) + for model_1_in_gpu, model_2_in_gpu, model_3_in_gpu in combinations: + gpu_output = np.asarray( + [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu], dtype=bool + ) + inputs = [ + grpcclient.InferInput( + "INPUT0", input0.shape, np_to_triton_dtype(input0.dtype) + ), + grpcclient.InferInput( + "GPU_OUTPUT", + gpu_output.shape, + np_to_triton_dtype(gpu_output.dtype), + ), + ] + inputs[0].set_data_from_numpy(input0) + inputs[1].set_data_from_numpy(gpu_output) + client.async_stream_infer(model_name=model_name, inputs=inputs) + if TRIAL == "default": + result = user_data._completed_requests.get() + output0 = result.as_numpy("OUTPUT0") + self.assertIsNotNone(output0) + self.assertTrue(np.all(output0 == input0)) + else: + response_repeat = 2 + for _ in range(response_repeat): + result = user_data._completed_requests.get() + output0 = result.as_numpy("OUTPUT0") + self.assertIsNotNone(output0) + self.assertTrue(np.all(output0 == input0)) + + def test_ensemble_io(self): + model_name = "ensemble_io" + + # FIXME: This test detects a decrease of 80 bytes, which fails inequality check: + # [ensemble_io] Shared memory leak detected: 1006976 (current) != 1007056 (prev). + # so Probe was modified to check for growth instead of inequality. + with self._shm_leak_detector.Probe(): + self._run_ensemble_test(model_name) + + def test_empty_gpu_output(self): + model_name = "dlpack_empty_output" + with self._shm_leak_detector.Probe(): + input_data = np.array([[1.0]], dtype=np.float32) + inputs = [ + grpcclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = self._client.infer(model_name, inputs) + output = result.as_numpy("OUTPUT") + self.assertIsNotNone(output) + self.assertEqual(output.size, 0) + + def test_variable_gpu_output(self): + model_name = "variable_gpu_output" + with self._shm_leak_detector.Probe(): + # Input is not important in this test + input_data = np.array([[1.0]], dtype=np.float32) + inputs = [ + grpcclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + user_data = UserData() + + # The test sends five requests to the model and the model returns five + # responses with different GPU output shapes + num_requests = 5 + for _ in range(num_requests): + _ = self._client.async_infer( + model_name=model_name, + inputs=inputs, + callback=partial(callback, user_data), + ) + + for i in range(num_requests): + result = user_data._completed_requests.get() + if result is InferenceServerException: + self.assertTrue(False, result) + output = result.as_numpy("OUTPUT") + self.assertIsNotNone(output) + self.assertEqual(output.size, i + 1) + np.testing.assert_almost_equal(output, np.ones(i + 1) * (i + 1)) + + # Non-decoupled models should filter outputs base on requested outputs. 
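+    # When no outputs are requested the server is expected to return every
+    # model output; when InferRequestedOutput objects are passed, only those
+    # outputs should come back. Both cases are checked below.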
+ def test_requested_output_default(self): + model_name = "add_sub" + shape = [16] + + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + grpcclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + # request for output 1, among output 0 and 1. + requested_outputs = [grpcclient.InferRequestedOutput("OUTPUT1")] + with self._shm_leak_detector.Probe(): + response = self._client.infer( + model_name=model_name, + inputs=inputs, + outputs=requested_outputs, + ) + outputs = response.get_response().outputs + self.assertEqual(len(outputs), len(requested_outputs)) + output1_data = response.as_numpy("OUTPUT1") + self.assertTrue(np.allclose(input0_data - input1_data, output1_data)) + + # without requested output should return all outputs + with self._shm_leak_detector.Probe(): + response = self._client.infer(model_name=model_name, inputs=inputs) + outputs = response.get_response().outputs + self.assertEqual(len(outputs), len(inputs)) + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + self.assertTrue(np.allclose(input0_data + input1_data, output0_data)) + self.assertTrue(np.allclose(input0_data - input1_data, output1_data)) + + # Decoupled models should filter outputs base on requested outputs. + def test_requested_output_decoupled(self): + model_name = "dlpack_io_identity_decoupled" + shape = [4] + expected_response_repeat = 2 + + input0_data = np.random.rand(*shape).astype(np.float32) + gpu_output_data = np.random.rand(*shape).astype(np.bool_) + inputs = [ + grpcclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + grpcclient.InferInput( + "GPU_OUTPUT", + gpu_output_data.shape, + np_to_triton_dtype(gpu_output_data.dtype), + ), + ] + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(gpu_output_data) + + # request for output 0, among output 0 and next gpu output. 
+ requested_outputs = [grpcclient.InferRequestedOutput("OUTPUT0")] + user_data = UserData() + with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: + client.start_stream(callback=partial(callback, user_data)) + client.async_stream_infer( + model_name=model_name, inputs=inputs, outputs=requested_outputs + ) + client.stop_stream() + for _ in range(expected_response_repeat): + self.assertFalse(user_data._completed_requests.empty()) + response = user_data._completed_requests.get() + outputs = response.get_response().outputs + self.assertEqual(len(outputs), len(requested_outputs)) + output0_data = response.as_numpy("OUTPUT0") + self.assertTrue(np.allclose(input0_data, output0_data)) + self.assertTrue(user_data._completed_requests.empty()) + + # without requested output should return all outputs + user_data = UserData() + with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: + client.start_stream(callback=partial(callback, user_data)) + client.async_stream_infer(model_name=model_name, inputs=inputs) + client.stop_stream() + for _ in range(expected_response_repeat): + self.assertFalse(user_data._completed_requests.empty()) + response = user_data._completed_requests.get() + outputs = response.get_response().outputs + self.assertEqual(len(outputs), len(inputs)) + output0_data = response.as_numpy("OUTPUT0") + next_gpu_output_data = response.as_numpy("NEXT_GPU_OUTPUT") + self.assertTrue(np.allclose(input0_data, output0_data)) + self.assertTrue(np.allclose(gpu_output_data[1:], next_gpu_output_data)) + self.assertTrue(user_data._completed_requests.empty()) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/io/test.sh b/qa/L0_backend_python/io/test.sh new file mode 100755 index 0000000000..e58cd3584f --- /dev/null +++ b/qa/L0_backend_python/io/test.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
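+# This script drives io_test.py: ensemble GPU/CPU tensor passing (default and
+# decoupled trials), empty and variable-shape GPU outputs, and requested-output
+# filtering for non-decoupled and decoupled models.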
+ +UNITTEST_PY=./io_test.py +CLIENT_LOG="./io_client.log" +TEST_RESULT_FILE='test_results.txt' +source ../common.sh +source ../../common/util.sh + +SERVER_ARGS="--model-repository=${MODELDIR}/io/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./io_server.log" + +RET=0 +rm -fr *.log ./models + +pip3 uninstall -y torch +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html + +# IOTest.test_ensemble_io +TRIALS="default decoupled" + +for trial in $TRIALS; do + export TRIAL=$trial + rm -rf ./models + + if [ $trial = "default" ]; then + for i in {1..3}; do + model_name=dlpack_io_identity_$i + mkdir -p models/$model_name/1/ + cp ../../python_models/dlpack_io_identity/model.py ./models/$model_name/1/ + cp ../../python_models/dlpack_io_identity/config.pbtxt ./models/$model_name/ + (cd models/$model_name && \ + sed -i "s/^name:.*/name: \"$model_name\"/" config.pbtxt) + done + else + for i in {1..3}; do + model_name=dlpack_io_identity_$i + mkdir -p models/$model_name/1/ + cp ../../python_models/dlpack_io_identity_decoupled/model.py ./models/$model_name/1/ + cp ../../python_models/dlpack_io_identity_decoupled/config.pbtxt ./models/$model_name/ + (cd models/$model_name && \ + sed -i "s/^name:.*/name: \"$model_name\"/" config.pbtxt) + done + fi + + mkdir -p models/ensemble_io/1/ + cp ../../python_models/ensemble_io/config.pbtxt ./models/ensemble_io + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + fi + + set +e + SUBTEST="test_ensemble_io" + python3 -m pytest --junitxml=${SUBTEST}.${TRIAL}.report.xml ${UNITTEST_PY}::IOTest::${SUBTEST} >> ${CLIENT_LOG}.${SUBTEST} + if [ $? -ne 0 ]; then + echo -e "\n***\n*** IOTest.${SUBTEST} FAILED. \n***" + cat $CLIENT_LOG.${SUBTEST} + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# IOTest.test_empty_gpu_output +rm -rf models && mkdir models +mkdir -p models/dlpack_empty_output/1/ +cp ../../python_models/dlpack_empty_output/model.py ./models/dlpack_empty_output/1/ +cp ../../python_models/dlpack_empty_output/config.pbtxt ./models/dlpack_empty_output/ + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +SUBTEST="test_empty_gpu_output" +python3 -m pytest --junitxml=${SUBTEST}.report.xml ${UNITTEST_PY}::IOTest::${SUBTEST} > ${CLIENT_LOG}.${SUBTEST} + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** IOTest.${SUBTEST} FAILED. \n***" + cat $CLIENT_LOG.${SUBTEST} + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# IOTest.test_variable_gpu_output +rm -rf models && mkdir models +mkdir -p models/variable_gpu_output/1/ +cp ../../python_models/variable_gpu_output/model.py ./models/variable_gpu_output/1/ +cp ../../python_models/variable_gpu_output/config.pbtxt ./models/variable_gpu_output/ + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +SUBTEST="test_variable_gpu_output" +python3 -m pytest --junitxml=${SUBTEST}.report.xml ${UNITTEST_PY}::IOTest::${SUBTEST} > ${CLIENT_LOG}.${SUBTEST} + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** IOTest.${SUBTEST} FAILED. 
\n***" + cat $CLIENT_LOG.${SUBTEST} + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# IOTest.test_requested_output_default & IOTest.test_requested_output_decoupled +rm -rf models && mkdir models +mkdir -p models/add_sub/1/ +cp ../../python_models/add_sub/model.py ./models/add_sub/1/ +cp ../../python_models/add_sub/config.pbtxt ./models/add_sub/ +mkdir -p models/dlpack_io_identity_decoupled/1/ +cp ../../python_models/dlpack_io_identity_decoupled/model.py ./models/dlpack_io_identity_decoupled/1/ +cp ../../python_models/dlpack_io_identity_decoupled/config.pbtxt ./models/dlpack_io_identity_decoupled/ + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +SUBTESTS="test_requested_output_default test_requested_output_decoupled" +for SUBTEST in $SUBTESTS; do + set +e + python3 -m pytest --junitxml=${SUBTEST}.report.xml ${UNITTEST_PY}::IOTest::${SUBTEST} > ${CLIENT_LOG}.${SUBTEST} + if [ $? -ne 0 ]; then + echo -e "\n***\n*** IOTest.${SUBTEST} FAILED. \n***" + cat $CLIENT_LOG.${SUBTEST} + RET=1 + fi + set -e +done + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** IO test PASSED.\n***" +else + echo -e "\n***\n*** IO test FAILED.\n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py new file mode 100755 index 0000000000..d6eb2a8f53 --- /dev/null +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import re +import sys + +import requests + +sys.path.append("../../common") + +import queue +import threading +import time +import unittest +from functools import partial + +import numpy as np +import shm_util +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class LifecycleTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def _get_metrics(self): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def _metrics_before_test(self, model, reason): + pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)' + metrics = self._get_metrics() + match = re.search(pattern, metrics) + if match: + return int(match.group(1)) + else: + raise Exception(f"Failure metrics for model='{model}' not found") + + def _assert_metrics( + self, model_name, reason, expected_count_increase, initial_count + ): + metrics = self._get_metrics() + # Add initial count + expected count for the the test + expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}' + self.assertIn(expected_metric, metrics) + + def test_error_code(self): + model_name = "error_code" + shape = [1, 1] + # [(Triton error, expected gRPC error message starting), ...] + errors = [ + ("UNKNOWN", "[StatusCode.UNKNOWN]"), + ("INTERNAL", "[StatusCode.INTERNAL]"), + ("NOT_FOUND", "[StatusCode.NOT_FOUND]"), + ("INVALID_ARG", "[StatusCode.INVALID_ARGUMENT]"), + ("UNAVAILABLE", "[StatusCode.UNAVAILABLE]"), + ("UNSUPPORTED", "[StatusCode.UNIMPLEMENTED]"), + ("ALREADY_EXISTS", "[StatusCode.ALREADY_EXISTS]"), + ("CANCELLED", "[StatusCode.CANCELLED]"), + ("(default)", "[StatusCode.INTERNAL] unrecognized"), + ] + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as client: + for error, expected_grpc_error_start in errors: + input_data = np.array([[error]], dtype=np.object_) + inputs = [ + grpcclient.InferInput( + "ERROR_CODE", shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + with self.assertRaises(InferenceServerException) as e: + client.infer(model_name, inputs) + # e.g. [StatusCode.UNKNOWN] error code: TRITONSERVER_ERROR_UNKNOWN + # e.g. 
[StatusCode.INTERNAL] unrecognized error code: (default) + self.assertEqual( + str(e.exception), + expected_grpc_error_start + " error code: " + error, + ) + + def test_execute_cancel(self): + model_name = "execute_cancel" + log_path = "lifecycle_server.log" + execute_delay = 4.0 # seconds + shape = [1, 1] + response = {"responded": False, "result": None, "error": None} + + def callback(result, error): + response["responded"] = True + response["result"] = result + response["error"] = error + + with self._shm_leak_detector.Probe() as shm_probe: + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as client: + input_data = np.array([[execute_delay]], dtype=np.float32) + inputs = [ + grpcclient.InferInput( + "EXECUTE_DELAY", shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + exec_future = client.async_infer(model_name, inputs, callback) + time.sleep(2) # ensure the request is executing + self.assertFalse(response["responded"]) + exec_future.cancel() + time.sleep(2) # ensure the cancellation is delivered + self.assertTrue(response["responded"]) + + self.assertEqual(response["result"], None) + self.assertIsInstance(response["error"], InferenceServerException) + self.assertEqual(response["error"].status(), "StatusCode.CANCELLED") + with open(log_path, mode="r", encoding="utf-8", errors="strict") as f: + log_text = f.read() + self.assertIn("[execute_cancel] Request not cancelled at 1.0 s", log_text) + self.assertIn("[execute_cancel] Request cancelled at ", log_text) + + def test_batch_error(self): + # The execute_error model returns an error for the first and third + # request and successfully processes the second request. This is making + # sure that an error in a single request does not completely fail the + # batch. 
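+        # Only the first and third responses should be InferenceServerException
+        # objects; the second response must echo its input unchanged.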
+ model_name = "execute_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + + with self._shm_leak_detector.Probe() as shm_probe: + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + + for i in range(number_of_requests): + result = user_data._completed_requests.get() + if i == 0 or i == 2: + self.assertIs(type(result), InferenceServerException) + continue + + print(result) + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + self.assertTrue( + np.array_equal(output_data, input_datas[i]), + "error: expected output {} to match input {}".format( + output_data, input_datas[i] + ), + ) + + def test_infer_pymodel_error(self): + model_name = "wrong_model" + shape = [2, 2] + initial_metrics_value = self._metrics_before_test(model_name, "BACKEND") + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = (16384 * np.random.randn(*shape)).astype(np.uint32) + inputs = [ + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + client.infer(model_name, inputs) + except InferenceServerException as e: + print(e.message()) + self.assertTrue( + e.message().startswith( + "Failed to process the request(s) for model " + ), + "Exception message is not correct", + ) + else: + self.assertTrue( + False, "Wrong exception raised or did not raise an exception" + ) + expected_count_increase = 1 + self._assert_metrics( + model_name, + "BACKEND", + expected_count_increase, + initial_metrics_value, + ) + + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. 
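+    # The execute_grpc_error model succeeds on the first request and returns a
+    # StatusCode.INTERNAL error on the second, so the stream is expected to
+    # close once that error is received.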
+ def test_triton_grpc_error_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 2 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + stream_end = False + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
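+    # A single request is sent; after its result (or error) is received,
+    # stop_stream(cancel_requests=True) is called, and the test passes as long
+    # as the cancellation completes without an unexpected exception.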
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True + if i == 0: + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE + def test_triton_grpc_error_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 4 + response_counter = 0 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + _ = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error + self.assertEqual(response_counter, number_of_requests) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh new file mode 100755 index 0000000000..59b846f56b --- /dev/null +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -0,0 +1,215 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="./lifecycle_client.log" +TEST_RESULT_FILE='test_results.txt' +source ../common.sh +source ../../common/util.sh + +SERVER_ARGS="--model-repository=${MODELDIR}/lifecycle/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./lifecycle_server.log" + +RET=0 +rm -fr *.log ./models + +mkdir -p models/error_code/1/ +cp ../../python_models/error_code/model.py ./models/error_code/1/ +cp ../../python_models/error_code/config.pbtxt ./models/error_code/ + +mkdir -p models/execute_cancel/1/ +cp ../../python_models/execute_cancel/model.py ./models/execute_cancel/1/ +cp ../../python_models/execute_cancel/config.pbtxt ./models/execute_cancel/ + +mkdir -p models/execute_error/1/ +cp ../../python_models/execute_error/model.py ./models/execute_error/1/ +cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ +(cd models/execute_error && \ + sed -i "s/^name:.*/name: \"execute_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) + +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + +mkdir -p models/execute_return_error/1/ +cp ../../python_models/execute_return_error/model.py ./models/execute_return_error/1/ +cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ + +mkdir -p models/wrong_model/1/ +cp ../../python_models/wrong_model/model.py ./models/wrong_model/1/ +cp ../../python_models/wrong_model/config.pbtxt ./models/wrong_model/ +(cd models/wrong_model && \ + sed -i "s/^name:.*/name: \"wrong_model\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_UINT32/g" config.pbtxt) + +prev_num_pages=`get_shm_pages` + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e + +# Run this multiple times to catch any intermittent segfault. +for i in {0..4}; do + python3 -m pytest --junitxml=lifecycle.iter${i}.report.xml lifecycle_test.py >> $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** lifecycle_test.py FAILED. 
\n***" + RET=1 + fi +done + +set -e + +kill_server + +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages were not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + RET=1 +fi + +# These models have errors in the initialization and finalization +# steps and we want to ensure that correct error is being returned + +rm -rf models/ +mkdir -p models/init_error/1/ +cp ../../python_models/init_error/model.py ./models/init_error/1/ +cp ../../python_models/init_error/config.pbtxt ./models/init_error/ + +set +e +prev_num_pages=`get_shm_pages` +run_server_nowait + +wait $SERVER_PID +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages were not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + RET=1 +fi + +grep "name 'lorem_ipsum' is not defined" $SERVER_LOG + +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** init_error model test failed \n***" + RET=1 +fi +set -e + +# FIXME: Until we find a way to simulate Ctrl^C on windows, this +# test will not pass. +if [[ ${TEST_WINDOWS} == 0 ]]; then + rm -rf models/ + mkdir -p models/fini_error/1/ + cp ../../python_models/fini_error/model.py ./models/fini_error/1/ + cp ../../python_models/fini_error/config.pbtxt ./models/fini_error/ + + prev_num_pages=`get_shm_pages` + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + fi + + kill_server + + current_num_pages=`get_shm_pages` + if [ $current_num_pages -ne $prev_num_pages ]; then + cat $CLIENT_LOG + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages were not cleaned properly. + Shared memory pages before starting triton equals to $prev_num_pages + and shared memory pages after starting triton equals to $current_num_pages \n***" + RET=1 + fi + + set +e + grep "name 'undefined_variable' is not defined" $SERVER_LOG + + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** fini_error model test failed \n***" + RET=1 + fi + set -e +fi + +rm -rf models/ +mkdir -p models/auto_complete_error/1/ +cp ../../python_models/auto_complete_error/model.py ./models/auto_complete_error/1/ + +SERVER_ARGS="${SERVER_ARGS} --strict-model-config=false" + +set +e +prev_num_pages=`get_shm_pages` +run_server_nowait + +wait $SERVER_PID +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages were not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + RET=1 +fi + +set +e +grep "name 'undefined_variable' is not defined" $SERVER_LOG + +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** auto_complete_error model test failed \n***" + RET=1 +fi +set -e + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Lifecycle test FAILED. \n***" +else + echo -e "\n***\n*** Lifecycle test PASSED. 
\n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/logging/logging_test.py b/qa/L0_backend_python/logging/logging_test.py new file mode 100755 index 0000000000..6be3125478 --- /dev/null +++ b/qa/L0_backend_python/logging/logging_test.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +sys.path.append("../../common") +import unittest + +import numpy as np +import shm_util +import tritonclient.http as httpclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class LogTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def test_log_output(self): + model_name = "identity_fp32_logging" + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = np.array([[1.0]], dtype=np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + self.assertIsNotNone(output0) + self.assertTrue(np.all(output0 == input_data)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/logging/test.sh b/qa/L0_backend_python/logging/test.sh new file mode 100755 index 0000000000..174f3e0140 --- /dev/null +++ b/qa/L0_backend_python/logging/test.sh @@ -0,0 +1,209 @@ +#!/bin/bash +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="logging_client.log" +TEST_RESULT_FILE="test_results.txt" +LOG_TEST="logging_test.py" +SERVER_LOG="./logging_server.log" + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +MODELSDIR=${MODELDIR}/logging/models +source ../../common/util.sh + +function verify_log_counts () { + non_verbose_expected=$1 + verbose_expected=$2 + + if [ `grep -c "Specific Msg!" $SERVER_LOG` != $non_verbose_expected ]; then + echo -e "\n***\n*** Test Failed: Specific Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Info Msg!" $SERVER_LOG` != $non_verbose_expected ]; then + echo -e "\n***\n*** Test Failed: Info Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Warning Msg!" $SERVER_LOG` != $non_verbose_expected ]; then + echo -e "\n***\n*** Test Failed: Warning Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Error Msg!" $SERVER_LOG` != $non_verbose_expected ]; then + echo -e "\n***\n*** Test Failed: Error Msg Count Incorrect\n***" + RET=1 + fi + if [ `grep -c "Verbose Msg!" 
$SERVER_LOG` != $verbose_expected ]; then + echo -e "\n***\n*** Test Failed: Verbose Msg Count Incorrect\n***" + RET=1 + fi +} + +rm -f *.log + +# set up simple repository MODELBASE +rm -fr ${MODELSDIR} && mkdir -p ${MODELSDIR} && \ + python_model="identity_fp32_logging" + mkdir -p models/$python_model/1/ + cp ../../python_models/${python_model}/config.pbtxt models/${python_model}/config.pbtxt + cp ../../python_models/${python_model}/model.py models/${python_model}/1/ +RET=0 + +#Run Server with Default Log Settings +SERVER_ARGS="--model-repository=${MODELSDIR} --backend-directory=${BACKEND_DIR}" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SUBTEST="default" +python3 -m pytest --junitxml=log_test.${SUBTEST}.report.xml ${LOG_TEST} >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Check if correct # log messages are present [ non-verbose-msg-cnt | verbose-msg-cnt ] +# NOTE: Windows does not seem to have a way to send a true SIGINT signal +# to tritonserver. Instead, it seems required to use taskkill.exe with /F (force) +# to kill the running program. This means the server terminates immediately, +# instead of shutting down how it would if Ctrl^C was invoked from the terminal. +# To properly test functionality, we need a WAR. In the meantime, we will subtract +# 1 from the expected values to account for the fact that no logs will be emitted +# from the finalize function. +if [[ ${TEST_WINDOWS} == 1 ]]; then + verify_log_counts 3 0 +else + verify_log_counts 4 0 +fi + + +rm -f *.log +#Run Server Enabling Verbose Messages +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Enable verbose logging +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":1}' ${TRITONSERVER_IPADDR}:8000/v2/logging` + +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed: Could not Change Log Settings\n***" + RET=1 +fi + +SUBTEST="verbose" +python3 -m pytest --junitxml=log_test.${SUBTEST}.report.xml ${LOG_TEST} >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Verbose only 3 because model must initialize before +# log settings can be modified +if [[ ${TEST_WINDOWS} == 1 ]]; then + verify_log_counts 3 2 +else + verify_log_counts 4 3 +fi + +rm -f *.log +#Run Server Enabling Verbose Messages +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Disable all logging +BOOL_PARAMS=${BOOL_PARAMS:="log_info log_warning log_error"} +for BOOL_PARAM in $BOOL_PARAMS; do + # Attempt to use integer instead of bool + code=`curl -s -w %{http_code} -o ./curl.out -d'{"'"$BOOL_PARAM"'":false}' ${TRITONSERVER_IPADDR}:8000/v2/logging` + if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed: Could not Change Log Settings\n***" + RET=1 + fi +done + +SUBTEST="disabled" +python3 -m pytest --junitxml=log_test.${SUBTEST}.report.xml ${LOG_TEST} >> ${CLIENT_LOG} 2>&1 +if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +# Will have 1 occurrence of each non-verbose log type +# because the server must initialize before log settings +# can be modified +# Same count for both Unix and Windows because this does +# not test log output in the finalize step. +verify_log_counts 1 0 + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Logging test PASSED. \n***" +else + echo -e "\n***\n*** Logging test FAILED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/model_control/model_control_test.py b/qa/L0_backend_python/model_control/model_control_test.py new file mode 100755 index 0000000000..9ccb73df4f --- /dev/null +++ b/qa/L0_backend_python/model_control/model_control_test.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
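+#
+# Exercises explicit model control from the client: repeatedly loads and
+# unloads a Python identity model and an ensemble built on top of it, and
+# verifies inference still works after each reload.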
+ +import os +import sys + +sys.path.append("../../common") + +import unittest + +import numpy as np +import shm_util +import tritonclient.http as httpclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class ExplicitModelTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def send_identity_request(self, client, model_name): + inputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) + input0_data = np.arange(start=0, stop=16, dtype=np.float32) + input0_data = np.expand_dims(input0_data, axis=0) + inputs[0].set_data_from_numpy(input0_data) + + with self._shm_leak_detector.Probe() as shm_probe: + result = client.infer( + model_name=model_name, + inputs=inputs, + outputs=[httpclient.InferRequestedOutput("OUTPUT0")], + ) + output_numpy = result.as_numpy("OUTPUT0") + self.assertTrue(np.all(input0_data == output_numpy)) + + def test_model_reload(self): + model_name = "identity_fp32" + ensemble_model_name = "simple_" + "identity_fp32" + with httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") as client: + for _ in range(5): + self.assertFalse(client.is_model_ready(model_name)) + # Load the model before the ensemble model to make sure reloading the + # model works properly in Python backend. + client.load_model(model_name) + client.load_model(ensemble_model_name) + self.assertTrue(client.is_model_ready(model_name)) + self.assertTrue(client.is_model_ready(ensemble_model_name)) + self.send_identity_request(client, model_name) + self.send_identity_request(client, ensemble_model_name) + client.unload_model(ensemble_model_name) + client.unload_model(model_name) + self.assertFalse(client.is_model_ready(model_name)) + self.assertFalse(client.is_model_ready(ensemble_model_name)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/model_control/test.sh b/qa/L0_backend_python/model_control/test.sh new file mode 100755 index 0000000000..e2c22f2685 --- /dev/null +++ b/qa/L0_backend_python/model_control/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="./model_control_client.log" +TEST_RESULT_FILE='test_results.txt' +SERVER_ARGS="--model-repository=${MODELDIR}/model_control/models --model-control-mode=explicit --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./model_control_server.log" + +RET=0 +rm -fr *.log ./models + +source ../../common/util.sh + +mkdir -p models/identity_fp32/1/ +mkdir -p models/simple_identity_fp32/1/ +cp ../../python_models/identity_fp32/model.py ./models/identity_fp32/1/model.py +cp ../../python_models/identity_fp32/config.pbtxt ./models/identity_fp32/config.pbtxt +cp ../../python_models/simple_identity_fp32/config.pbtxt ./models/simple_identity_fp32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 -m pytest --junitxml=model_control.report.xml model_control_test.py 2>&1 > $CLIENT_LOG + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** model_control_test.py FAILED. \n***" + RET=1 +fi +set -e + +kill_server + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** model_control_test FAILED. \n***" +else + echo -e "\n***\n*** model_control_test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/python_based_backends/python_based_backends_test.py b/qa/L0_backend_python/python_based_backends/python_based_backends_test.py new file mode 100644 index 0000000000..24051e5217 --- /dev/null +++ b/qa/L0_backend_python/python_based_backends/python_based_backends_test.py @@ -0,0 +1,150 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys +import unittest +from random import randint + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +sys.path.append("../../common") + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class PythonBasedBackendsTest(unittest.TestCase): + def setUp(self): + self.triton_client = grpcclient.InferenceServerClient( + url=f"{_tritonserver_ipaddr}:8001" + ) + self.add_sub_model_1 = "add" + self.add_sub_model_2 = "sub" + self.python_model = "add_sub" + self.pytorch_model = "add_sub_pytorch" + + self.triton_client.load_model( + self.add_sub_model_1, + config='{"backend":"add_sub","version_policy":{"latest":{"num_versions":2}}}', + ) + self.triton_client.load_model(self.add_sub_model_2) + self.triton_client.load_model(self.python_model) + self.triton_client.load_model(self.pytorch_model) + + def test_add_sub_models(self): + self.assertTrue( + self.triton_client.is_model_ready(self.add_sub_model_1, model_version="2") + ) + self._test_add_sub_model( + model_name=self.add_sub_model_1, model_version="2", single_output=True + ) + + self.assertTrue( + self.triton_client.is_model_ready(self.add_sub_model_1, model_version="1") + ) + self._test_add_sub_model( + model_name=self.add_sub_model_1, model_version="1", single_output=True + ) + + self.assertTrue(self.triton_client.is_model_ready(self.add_sub_model_2)) + self._test_add_sub_model(model_name=self.add_sub_model_2, single_output=True) + + def test_python_model(self): + self.assertTrue( + self.triton_client.is_model_ready(self.python_model, model_version="2") + ) + self._test_add_sub_model( + model_name=self.python_model, shape=[16], model_version="2" + ) + + def test_pytorch_model(self): + self.assertTrue( + self.triton_client.is_model_ready(self.pytorch_model, model_version="1") + ) + self._test_add_sub_model(model_name=self.pytorch_model) + + def _test_add_sub_model( + self, model_name, model_version="1", shape=[4], single_output=False + ): + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + + inputs = [ + grpcclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + grpcclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + if single_output: + outputs = [grpcclient.InferRequestedOutput("OUTPUT")] + + else: + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + response = self.triton_client.infer( + model_name=model_name, + inputs=inputs, + model_version=model_version, + request_id=str(randint(10, 99)), + outputs=outputs, + ) + + if single_output: + if model_name == "add": + self.assertTrue( + 
np.allclose(input0_data + input1_data, response.as_numpy("OUTPUT")) + ) + else: + self.assertTrue( + np.allclose(input0_data - input1_data, response.as_numpy("OUTPUT")) + ) + else: + self.assertTrue( + np.allclose(input0_data + input1_data, response.as_numpy("OUTPUT0")) + ) + self.assertTrue( + np.allclose(input0_data - input1_data, response.as_numpy("OUTPUT1")) + ) + + def tearDown(self): + self.triton_client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/python_based_backends/test.sh b/qa/L0_backend_python/python_based_backends/test.sh new file mode 100755 index 0000000000..c6d55d6ed3 --- /dev/null +++ b/qa/L0_backend_python/python_based_backends/test.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
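+#
+# Sets up a Python-based "add_sub" backend (add/sub models), a Python add_sub
+# model, and a generated PyTorch model, then runs python_based_backends_test.py
+# against a server started in explicit model-control mode.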
+ +source ../../common/util.sh + +QA_MODELS_PATH="../../python_models" +MODEL_REPOSITORY="${MODELDIR}/python_based_backends/models" +SERVER_ARGS="--model-repository=${MODEL_REPOSITORY} --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --log-verbose=1" +SERVER_LOG="./python_based_backends_server.log" +CLIENT_LOG="./python_based_backends_client.log" +TEST_RESULT_FILE="./test_results.txt" +CLIENT_PY="./python_based_backends_test.py" +GEN_PYTORCH_MODEL_PY="../../common/gen_qa_pytorch_model.py" +RET=0 + +rm -rf ${MODEL_REPOSITORY} +pip3 install torch + +# Setup add_sub backend and models +mkdir -p ${BACKEND_DIR}/add_sub +cp ${QA_MODELS_PATH}/python_based_backends/add_sub_backend/model.py ${BACKEND_DIR}/add_sub/model.py + +mkdir -p ${MODEL_REPOSITORY}/add/1/ +echo '{ "operation": "add" }' > ${MODEL_REPOSITORY}/add/1/model.json +echo "backend: \"add_sub\"" > ${MODEL_REPOSITORY}/add/config.pbtxt +cp -r ${MODEL_REPOSITORY}/add/1/ ${MODEL_REPOSITORY}/add/2/ + +mkdir -p ${MODEL_REPOSITORY}/sub/1/ +echo '{ "operation": "sub" }' > ${MODEL_REPOSITORY}/sub/1/model.json +echo "backend: \"add_sub\"" > ${MODEL_REPOSITORY}/sub/config.pbtxt + +# Setup python backend model +mkdir -p ${MODEL_REPOSITORY}/add_sub/1 +cp ${QA_MODELS_PATH}/add_sub/model.py ${MODEL_REPOSITORY}/add_sub/1/ +cp ${QA_MODELS_PATH}/add_sub/config.pbtxt ${MODEL_REPOSITORY}/add_sub/ +cp -r ${MODEL_REPOSITORY}/add_sub/1/ ${MODEL_REPOSITORY}/add_sub/2/ + +# Setup pytorch backend model +cp ${GEN_PYTORCH_MODEL_PY} ./gen_qa_pytorch_model.py +GEN_PYTORCH_MODEL_PY=./gen_qa_pytorch_model.py + +set +e +python3 ${GEN_PYTORCH_MODEL_PY} -m ${MODEL_REPOSITORY} + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Running ${GEN_PYTORCH_MODEL_PY} FAILED. \n***" + exit 1 +fi +set -e + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 -m pytest --junitxml=python_based_backends.report.xml ${CLIENT_PY} -v > ${CLIENT_LOG} 2>&1 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Running ${CLIENT_PY} FAILED. \n***" + RET=1 +fi +set -e + +kill_server +rm -rf ${MODEL_REPOSITORY} ${GEN_PYTORCH_MODEL_PY} + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Python-based Backends test FAILED. \n***" +else + echo -e "\n***\n*** Python-based Backends test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py new file mode 100755 index 0000000000..c2512600e2 --- /dev/null +++ b/qa/L0_backend_python/python_test.py @@ -0,0 +1,551 @@ +#!/usr/bin/python + +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import unittest + +import numpy as np +import requests as httpreq +import shm_util +import tritonclient.http as httpclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +_test_jetson = bool(int(os.environ.get("TEST_JETSON", 0))) +_test_windows = bool(int(os.environ.get("TEST_WINDOWS", 0))) + + +class PythonTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def _infer_help(self, model_name, shape, data_type): + with httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") as client: + input_data_0 = np.array(np.random.randn(*shape), dtype=data_type) + inputs = [ + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data_0.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data_0) + + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + self.assertTrue(np.all(input_data_0 == output0)) + + def _create_cuda_region(self, client, size, name): + import tritonclient.utils.cuda_shared_memory as cuda_shared_memory + + shm0_handle = cuda_shared_memory.create_shared_memory_region( + name, byte_size=size, device_id=0 + ) + client.register_cuda_shared_memory( + name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size + ) + return shm0_handle + + def _optional_input_infer(self, model_name, has_input0, has_input1): + with httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") as client: + shape = (1,) + if has_input0: + input0_numpy = np.random.randint(0, 100, size=shape, dtype=np.int32) + else: + # Set the input0 to a default value if it is optional. This is + # the input used by the model if it is not provided. + input0_numpy = np.array([5], dtype=np.int32) + + if has_input1: + input1_numpy = np.random.randint(0, 100, size=shape, dtype=np.int32) + else: + # Set the input1 to a default value if it is optional. This is + # the input used by the model if it is not provided. 
+                input1_numpy = np.array([5], dtype=np.int32)
+
+            inputs = []
+            if has_input0:
+                inputs.append(
+                    httpclient.InferInput(
+                        "INPUT0", shape, np_to_triton_dtype(input0_numpy.dtype)
+                    )
+                )
+                inputs[-1].set_data_from_numpy(input0_numpy)
+
+            if has_input1:
+                inputs.append(
+                    httpclient.InferInput(
+                        "INPUT1", shape, np_to_triton_dtype(input1_numpy.dtype)
+                    )
+                )
+                inputs[-1].set_data_from_numpy(input1_numpy)
+
+            result = client.infer(model_name, inputs)
+            output0 = result.as_numpy("OUTPUT0")
+            self.assertIsNotNone(output0, "OUTPUT0 was not found.")
+
+            output1 = result.as_numpy("OUTPUT1")
+            self.assertIsNotNone(output1, "OUTPUT1 was not found.")
+
+            expected_output0 = input0_numpy + input1_numpy
+            expected_output1 = input0_numpy - input1_numpy
+            np.testing.assert_equal(
+                output0, expected_output0, "OUTPUT0 doesn't match expected OUTPUT0"
+            )
+            np.testing.assert_equal(
+                output1, expected_output1, "OUTPUT1 doesn't match expected OUTPUT1"
+            )
+
+    def test_growth_error(self):
+        # NOTE: Windows tests are not running in a docker container. Consequently, we
+        # do not specify a --shm-size to use as a basis to grow. Therefore, this test
+        # does not apply to Windows.
+        if not _test_windows:
+            # 2 MiBs
+            total_byte_size = 2 * 1024 * 1024
+            shape = [total_byte_size]
+            model_name = "identity_uint8_nobatch"
+            dtype = np.uint8
+            with self._shm_leak_detector.Probe() as shm_probe:
+                self._infer_help(model_name, shape, dtype)
+
+            # 1 GiB payload leads to error in the main Python backend process.
+            # Total shared memory available is 1GiB.
+            total_byte_size = 1024 * 1024 * 1024
+            shape = [total_byte_size]
+            with self.assertRaises(InferenceServerException) as ex:
+                self._infer_help(model_name, shape, dtype)
+            self.assertIn(
+                "Failed to increase the shared memory pool size", str(ex.exception)
+            )
+
+            # 512 MiBs payload leads to error in the Python stub process.
+            total_byte_size = 512 * 1024 * 1024
+            shape = [total_byte_size]
+            with self.assertRaises(InferenceServerException) as ex:
+                self._infer_help(model_name, shape, dtype)
+            self.assertIn(
+                "Failed to increase the shared memory pool size", str(ex.exception)
+            )
+
+            # 2 MiBs
+            # Send a small payload to make sure it is still working properly
+            total_byte_size = 2 * 1024 * 1024
+            shape = [total_byte_size]
+            with self._shm_leak_detector.Probe() as shm_probe:
+                self._infer_help(model_name, shape, dtype)
+
+    # GPU tensors are not supported on jetson
+    # CUDA Shared memory is not supported on jetson
+    if not _test_jetson and not _test_windows:
+
+        def test_gpu_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+
+            model_name = "identity_bool"
+            with self._shm_leak_detector.Probe() as shm_probe:
+                with httpclient.InferenceServerClient(
+                    f"{_tritonserver_ipaddr}:8000"
+                ) as client:
+                    input_data = np.array([[True] * 1000], dtype=bool)
+                    inputs = [
+                        httpclient.InferInput(
+                            "INPUT0",
+                            input_data.shape,
+                            np_to_triton_dtype(input_data.dtype),
+                        )
+                    ]
+                    inputs[0].set_data_from_numpy(input_data)
+
+                    requested_outputs = [httpclient.InferRequestedOutput("OUTPUT0")]
+
+                    # intentionally create a shared memory region with not enough size. 
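+                    # A 1-byte CUDA shared memory region cannot hold the
+                    # 1000-byte boolean output, so the inference below is
+                    # expected to fail with a size error.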
+ client.unregister_cuda_shared_memory() + shm0_handle = self._create_cuda_region(client, 1, "output0_data") + + requested_outputs[0].set_shared_memory("output0_data", 1) + with self.assertRaises(InferenceServerException) as ex: + client.infer(model_name, inputs, outputs=requested_outputs) + self.assertIn( + "should be at least 1000 bytes to hold the results", + str(ex.exception), + ) + client.unregister_cuda_shared_memory() + cuda_shared_memory.destroy_shared_memory_region(shm0_handle) + + def test_dlpack_tensor_error(self): + import tritonclient.utils.cuda_shared_memory as cuda_shared_memory + + model_name = "dlpack_identity" + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = np.array([[1] * 1000], dtype=np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", + input_data.shape, + np_to_triton_dtype(input_data.dtype), + ) + ] + + requested_outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + input_data_size = input_data.itemsize * input_data.size + client.unregister_cuda_shared_memory() + input_region = self._create_cuda_region( + client, input_data_size, "input0_data" + ) + inputs[0].set_shared_memory("input0_data", input_data_size) + cuda_shared_memory.set_shared_memory_region( + input_region, [input_data] + ) + + # Intentionally create a small region to trigger an error + shm0_handle = self._create_cuda_region(client, 1, "output0_data") + requested_outputs[0].set_shared_memory("output0_data", 1) + + with self.assertRaises(InferenceServerException) as ex: + client.infer(model_name, inputs, outputs=requested_outputs) + self.assertIn( + "should be at least 4000 bytes to hold the results", + str(ex.exception), + ) + client.unregister_cuda_shared_memory() + cuda_shared_memory.destroy_shared_memory_region(shm0_handle) + + def test_async_infer(self): + model_name = "identity_uint8" + request_parallelism = 4 + shape = [2, 2] + + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000", concurrency=request_parallelism + ) as client: + input_datas = [] + requests = [] + for i in range(request_parallelism): + input_data = (16384 * np.random.randn(*shape)).astype(np.uint8) + input_datas.append(input_data) + inputs = [ + httpclient.InferInput( + "INPUT0", + input_data.shape, + np_to_triton_dtype(input_data.dtype), + ) + ] + inputs[0].set_data_from_numpy(input_data) + requests.append(client.async_infer(model_name, inputs)) + + for i in range(request_parallelism): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + results = requests[i].get_result() + + output_data = results.as_numpy("OUTPUT0") + self.assertIsNotNone(output_data, "error: expected 'OUTPUT0'") + self.assertTrue( + np.array_equal(output_data, input_datas[i]), + "error: expected output {} to match input {}".format( + output_data, input_datas[i] + ), + ) + + # Make sure the requests ran in parallel. 
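+                # The four 2x2 requests are expected to be batched into a single
+                # execution, so the statistics checked below should report
+                # inference_count == 8 and execution_count == 1.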
+ stats = client.get_inference_statistics(model_name) + test_cond = (len(stats["model_stats"]) != 1) or ( + stats["model_stats"][0]["name"] != model_name + ) + self.assertFalse( + test_cond, "error: expected statistics for {}".format(model_name) + ) + + stat = stats["model_stats"][0] + self.assertFalse( + (stat["inference_count"] != 8) or (stat["execution_count"] != 1), + "error: expected execution_count == 1 and inference_count == 8, got {} and {}".format( + stat["execution_count"], stat["inference_count"] + ), + ) + batch_stat = stat["batch_stats"][0] + self.assertFalse( + batch_stat["batch_size"] != 8, + f"error: expected batch_size == 8, got {batch_stat['batch_size']}", + ) + # Check metrics to make sure they are reported correctly + metrics = httpreq.get(f"http://{_tritonserver_ipaddr}:8002/metrics") + print(metrics.text) + + success_str = ( + 'nv_inference_request_success{model="identity_uint8",version="1"}' + ) + infer_count_str = ( + 'nv_inference_count{model="identity_uint8",version="1"}' + ) + infer_exec_str = ( + 'nv_inference_exec_count{model="identity_uint8",version="1"}' + ) + + success_val = None + infer_count_val = None + infer_exec_val = None + for line in metrics.text.splitlines(): + if line.startswith(success_str): + success_val = float(line[len(success_str) :]) + if line.startswith(infer_count_str): + infer_count_val = float(line[len(infer_count_str) :]) + if line.startswith(infer_exec_str): + infer_exec_val = float(line[len(infer_exec_str) :]) + + self.assertFalse( + success_val != 4, + "error: expected metric {} == 4, got {}".format( + success_str, success_val + ), + ) + self.assertFalse( + infer_count_val != 8, + "error: expected metric {} == 8, got {}".format( + infer_count_str, infer_count_val + ), + ) + self.assertFalse( + infer_exec_val != 1, + "error: expected metric {} == 1, got {}".format( + infer_exec_str, infer_exec_val + ), + ) + + def test_bool(self): + model_name = "identity_bool" + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = np.array([[True, False, True]], dtype=bool) + inputs = [ + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + self.assertIsNotNone(output0) + self.assertTrue(np.all(output0 == input_data)) + + def test_bf16(self): + model_name = "identity_bf16" + shape = [2, 2] + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + # NOTE: Client will truncate FP32 to BF16 internally + # since numpy has no built-in BF16 representation. + np_input = np.ones(shape, dtype=np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", np_input.shape, "BF16" + ).set_data_from_numpy(np_input) + ] + result = client.infer(model_name, inputs) + + # Assert that Triton correctly returned a BF16 tensor. + response = result.get_response() + triton_output = response["outputs"][0] + triton_dtype = triton_output["datatype"] + self.assertEqual(triton_dtype, "BF16") + + np_output = result.as_numpy("OUTPUT0") + self.assertIsNotNone(np_output) + # BF16 tensors are held in FP32 when converted to numpy due to + # lack of native BF16 support in numpy, so verify that. 
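+                # np.allclose (rather than exact equality) is used because the
+                # FP32 -> BF16 -> FP32 round trip may lose precision.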
+ self.assertEqual(np_output.dtype, np.float32) + self.assertTrue(np.allclose(np_output, np_input)) + + def test_infer_pytorch(self): + # FIXME: This model requires torch. Because windows tests are not run in a docker + # environment with torch installed, we need to think about how we want to install + # the package. Do we install it on the runners? Within the model? + if not _test_windows: + model_name = "pytorch_fp32_fp32" + shape = [1, 1, 28, 28] + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = np.zeros(shape, dtype=np.float32) + inputs = [ + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = client.infer(model_name, inputs) + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + + # expected inference response from a zero tensor + expected_result = [ + -2.2377274, + -2.3976364, + -2.2464046, + -2.2790744, + -2.3828976, + -2.2940576, + -2.2928185, + -2.340665, + -2.275219, + -2.292135, + ] + self.assertTrue( + np.allclose(output_data[0], expected_result), + "Inference result is not correct", + ) + + def test_init_args(self): + model_name = "init_args" + shape = [2, 2] + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = np.zeros(shape, dtype=np.float32) + inputs = [ + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = client.infer(model_name, inputs) + # output response in this model is the number of keys in the args + self.assertTrue( + result.as_numpy("OUT") == 7, + "Number of keys in the init args is not correct", + ) + + def test_unicode(self): + model_name = "string" + shape = [1] + + # The first run will use np.bytes_ and the second run will use + # np.object_ + for i in range(2): + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + utf8 = "😀" + input_data = np.array( + [bytes(utf8, encoding="utf-8")], dtype=np.bytes_ + ) + inputs = [ + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + self.assertIsNotNone(output0) + self.assertEqual(output0[0], input_data) + + def test_optional_input(self): + model_name = "optional" + + with self._shm_leak_detector.Probe() as shm_probe: + for has_input0 in [True, False]: + for has_input1 in [True, False]: + self._optional_input_infer(model_name, has_input0, has_input1) + + def test_string(self): + model_name = "string_fixed" + shape = [1] + + # Test different string outputs. This test will send 4 requests to the + # backend. 
The model will return 4 responses (np.object_ and np.bytes) * + # (empty output and fixed output) + for i in range(4): + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_data = np.array(["123456"], dtype=np.object_) + inputs = [ + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + self.assertIsNotNone(output0) + + if i % 2 == 0: + self.assertEqual(output0[0], input_data.astype(np.bytes_)) + else: + self.assertEqual(output0.size, 0) + + def test_non_contiguous(self): + model_name = "non_contiguous" + shape = [2, 10, 11, 6, 5] + new_shape = [10, 2, 6, 5, 11] + shape_reorder = [1, 0, 4, 2, 3] + with self._shm_leak_detector.Probe() as shm_probe: + with httpclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8000" + ) as client: + input_numpy = np.random.rand(*shape) + input_numpy = input_numpy.astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_numpy.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_numpy) + result = client.infer(model_name, inputs) + output0 = input_numpy.reshape(new_shape) + + # Transpose the tensor to create a non-contiguous tensor. + output1 = input_numpy.T + output2 = np.transpose(input_numpy, shape_reorder) + + self.assertTrue(np.all(output0 == result.as_numpy("OUTPUT0"))) + self.assertTrue(np.all(output1 == result.as_numpy("OUTPUT1"))) + self.assertTrue(np.all(output2 == result.as_numpy("OUTPUT2"))) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/request_rescheduling/grpc_endpoint_test.py b/qa/L0_backend_python/request_rescheduling/grpc_endpoint_test.py new file mode 100755 index 0000000000..9dcb648d87 --- /dev/null +++ b/qa/L0_backend_python/request_rescheduling/grpc_endpoint_test.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +sys.path.append("../../common") + +# GRPC streaming helpers.. +import queue +import unittest +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class GrpcEndpointTest(unittest.TestCase): + def test_grpc_decoupled(self, sequence_id=0, sequence_start=False): + user_data = UserData() + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as triton_client: + # Reload the model to reset the flag + triton_client.unload_model("iterative_sequence") + triton_client.load_model("iterative_sequence") + + triton_client.start_stream(callback=partial(callback, user_data)) + inputs = [] + inputs.append(grpcclient.InferInput("IN", [1], "INT32")) + inputs[0].set_data_from_numpy(np.array([3], dtype=np.int32)) + + triton_client.async_stream_infer( + model_name="iterative_sequence", + inputs=inputs, + sequence_id=sequence_id, + sequence_start=sequence_start, + ) + res_count = 3 + while res_count > 0: + data_item = user_data._completed_requests.get() + res_count -= 1 + if type(data_item) == InferenceServerException: + raise data_item + else: + self.assertEqual(res_count, data_item.as_numpy("OUT")[0]) + self.assertEqual(0, res_count) + + def test_grpc_non_decoupled(self, sequence_id=0, sequence_start=False): + with grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) as triton_client: + # Reload the model to reset the flag + triton_client.unload_model("request_rescheduling_addsub") + triton_client.load_model("request_rescheduling_addsub") + + inputs = [] + inputs.append(grpcclient.InferInput("INPUT0", [16], "FP32")) + inputs.append(grpcclient.InferInput("INPUT1", [16], "FP32")) + input0_val = np.random.randn(*[16]).astype(np.float32) + input1_val = np.random.randn(*[16]).astype(np.float32) + inputs[0].set_data_from_numpy(input0_val) + inputs[1].set_data_from_numpy(input1_val) + + results = triton_client.infer( + model_name="request_rescheduling_addsub", + inputs=inputs, + ) + + output0_data = results.as_numpy("OUTPUT0") + output1_data = results.as_numpy("OUTPUT1") + + self.assertTrue(np.array_equal(output0_data, input0_val + input1_val)) + self.assertTrue(np.array_equal(output1_data, input0_val - input1_val)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/request_rescheduling/test.sh b/qa/L0_backend_python/request_rescheduling/test.sh new file mode 100755 index 0000000000..31ba6692d9 --- /dev/null +++ 
b/qa/L0_backend_python/request_rescheduling/test.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_PY="../test_infer_shm_leak.py" +CLIENT_LOG="./request_rescheduling_client.log" +TEST_RESULT_FILE='test_results.txt' +source ../../common/util.sh + +RET=0 + +rm -fr *.log ./models *.txt + +mkdir -p models/bls_request_rescheduling/1/ +cp ../../python_models/bls_request_rescheduling/model.py models/bls_request_rescheduling/1/ +cp ../../python_models/bls_request_rescheduling/config.pbtxt models/bls_request_rescheduling + +mkdir -p models/request_rescheduling_addsub/1/ +cp ../../python_models/request_rescheduling_addsub/model.py models/request_rescheduling_addsub/1/ +cp ../../python_models/request_rescheduling_addsub/config.pbtxt models/request_rescheduling_addsub + +mkdir -p models/iterative_sequence/1/ +cp ../../python_models/iterative_sequence/model.py models/iterative_sequence/1/ +cp ../../python_models/iterative_sequence/config.pbtxt models/iterative_sequence + +mkdir -p models/wrong_return_type/1/ +cp ../../python_models/wrong_return_type/model.py models/wrong_return_type/1/ +cp ../../python_models/wrong_return_type/config.pbtxt models/wrong_return_type + +SERVER_LOG="./request_rescheduling_server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/request_rescheduling/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=* --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +export MODEL_NAME='bls_request_rescheduling' + +set +e +python3 -m pytest --junitxml="${MODEL_NAME}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** bls_request_rescheduling test FAILED. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +GRPC_TEST_PY=./grpc_endpoint_test.py + +set +e +python3 -m pytest --junitxml="grpc_request_reschedule.report.xml" ${GRPC_TEST_PY} >> ${CLIENT_LOG} 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** GRPC Endpoint test FAILED. \n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + + +if [ $RET -eq 1 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Request Rescheduling test FAILED. \n***" +else + echo -e "\n***\n*** Request Rescheduling test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py new file mode 100644 index 0000000000..386a54e3d3 --- /dev/null +++ b/qa/L0_backend_python/response_sender/response_sender_complete_final_test.py @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class ResponseSenderTest(unittest.TestCase): + def _generate_streaming_callback_and_responses_pair(self): + responses = [] # [{"result": result, "error": error}, ...] 
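+ # The gRPC stream invokes `callback` from the client's stream thread; the
+ # test only reads `responses` after stop_stream() returns, so appending to a
+ # plain list is assumed to be sufficient here.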
+ + def callback(result, error): + responses.append({"result": result, "error": error}) + + return callback, responses + + def test_respond_after_complete_final(self): + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Test Passed", server_log) + + model_name = "response_sender_complete_final" + shape = [1, 1] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + input0_np = np.array([[123.45]], np.float32) + inputs[0].set_data_from_numpy(input0_np) + + callback, responses = self._generate_streaming_callback_and_responses_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + client.async_stream_infer(model_name, inputs) + client.stop_stream() + + self.assertEqual(len(responses), 1) + for response in responses: + output0_np = response["result"].as_numpy(name="OUTPUT0") + self.assertTrue(np.allclose(input0_np, output0_np)) + self.assertIsNone(response["error"]) + + time.sleep(1) # make sure the logs are written before checking + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Unexpected request length", server_log) + self.assertNotIn("Expected exception not raised", server_log) + self.assertNotIn("Test FAILED", server_log) + self.assertIn("Test Passed", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/response_sender/response_sender_test.py b/qa/L0_backend_python/response_sender/response_sender_test.py new file mode 100644 index 0000000000..81f8c75f2c --- /dev/null +++ b/qa/L0_backend_python/response_sender/response_sender_test.py @@ -0,0 +1,583 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
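+
+# Each test case drives a response_sender model with one combination of: how
+# many responses are sent before execute() returns, whether execute() itself
+# returns a response, how many responses are sent after it returns, and when
+# the complete-final flag is sent, then validates the responses (or errors)
+# observed on the client side.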
+ +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class ResponseSenderTest(unittest.TestCase): + _inputs_parameters_zero_response_pre_return = { + "number_of_response_before_return": 0, + "send_complete_final_flag_before_return": True, + "return_a_response": False, + "number_of_response_after_return": 0, + "send_complete_final_flag_after_return": False, + } + _inputs_parameters_zero_response_post_return = { + "number_of_response_before_return": 0, + "send_complete_final_flag_before_return": False, + "return_a_response": False, + "number_of_response_after_return": 0, + "send_complete_final_flag_after_return": True, + } + _inputs_parameters_one_response_pre_return = { + "number_of_response_before_return": 1, + "send_complete_final_flag_before_return": True, + "return_a_response": False, + "number_of_response_after_return": 0, + "send_complete_final_flag_after_return": False, + } + _inputs_parameters_one_response_post_return = { + "number_of_response_before_return": 0, + "send_complete_final_flag_before_return": False, + "return_a_response": False, + "number_of_response_after_return": 1, + "send_complete_final_flag_after_return": True, + } + _inputs_parameters_two_response_pre_return = { + "number_of_response_before_return": 2, + "send_complete_final_flag_before_return": True, + "return_a_response": False, + "number_of_response_after_return": 0, + "send_complete_final_flag_after_return": False, + } + _inputs_parameters_two_response_post_return = { + "number_of_response_before_return": 0, + "send_complete_final_flag_before_return": False, + "return_a_response": False, + "number_of_response_after_return": 2, + "send_complete_final_flag_after_return": True, + } + _inputs_parameters_response_pre_and_post_return = { + "number_of_response_before_return": 1, + "send_complete_final_flag_before_return": False, + "return_a_response": False, + "number_of_response_after_return": 3, + "send_complete_final_flag_after_return": True, + } + _inputs_parameters_one_response_on_return = { + "number_of_response_before_return": 0, + "send_complete_final_flag_before_return": False, + "return_a_response": True, + "number_of_response_after_return": 0, + "send_complete_final_flag_after_return": False, + } + _inputs_parameters_one_response_pre_and_on_return = { + "number_of_response_before_return": 1, + "send_complete_final_flag_before_return": True, + "return_a_response": True, + "number_of_response_after_return": 0, + "send_complete_final_flag_after_return": False, + } + _inputs_parameters_one_response_on_and_post_return = { + "number_of_response_before_return": 0, + "send_complete_final_flag_before_return": False, + "return_a_response": True, + "number_of_response_after_return": 1, + "send_complete_final_flag_after_return": True, + } + + def _get_inputs( + self, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ): + shape = [1, 1] + inputs = [ + grpcclient.InferInput("NUMBER_OF_RESPONSE_BEFORE_RETURN", shape, "UINT8"), + grpcclient.InferInput( + "SEND_COMPLETE_FINAL_FLAG_BEFORE_RETURN", shape, "BOOL" + ), + grpcclient.InferInput("RETURN_A_RESPONSE", shape, "BOOL"), + grpcclient.InferInput("NUMBER_OF_RESPONSE_AFTER_RETURN", shape, "UINT8"), + grpcclient.InferInput( + "SEND_COMPLETE_FINAL_FLAG_AFTER_RETURN", shape, "BOOL" + ), + ] + inputs[0].set_data_from_numpy( + 
np.array([[number_of_response_before_return]], np.uint8) + ) + inputs[1].set_data_from_numpy( + np.array([[send_complete_final_flag_before_return]], bool) + ) + inputs[2].set_data_from_numpy(np.array([[return_a_response]], bool)) + inputs[3].set_data_from_numpy( + np.array([[number_of_response_after_return]], np.uint8) + ) + inputs[4].set_data_from_numpy( + np.array([[send_complete_final_flag_after_return]], bool) + ) + return inputs + + def _generate_streaming_callback_and_responses_pair(self): + responses = [] # [{"result": result, "error": error}, ...] + + def callback(result, error): + responses.append({"result": result, "error": error}) + + return callback, responses + + def _infer_parallel(self, model_name, parallel_inputs): + callback, responses = self._generate_streaming_callback_and_responses_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + for inputs in parallel_inputs: + client.async_stream_infer(model_name, inputs) + client.stop_stream() + return responses + + def _infer( + self, + model_name, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ): + inputs = self._get_inputs( + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ) + return self._infer_parallel(model_name, [inputs]) + + def _assert_responses_valid( + self, + responses, + number_of_response_before_return, + return_a_response, + number_of_response_after_return, + ): + before_return_response_count = 0 + response_returned = False + after_return_response_count = 0 + for response in responses: + result, error = response["result"], response["error"] + self.assertIsNone(error) + result_np = result.as_numpy(name="INDEX") + response_id = result_np.sum() / result_np.shape[0] + if response_id < 1000: + self.assertFalse( + response_returned, + "Expect at most one response returned per request.", + ) + response_returned = True + elif response_id < 2000: + before_return_response_count += 1 + elif response_id < 3000: + after_return_response_count += 1 + else: + raise ValueError(f"Unexpected response_id: {response_id}") + self.assertEqual(number_of_response_before_return, before_return_response_count) + self.assertEqual(return_a_response, response_returned) + self.assertEqual(number_of_response_after_return, after_return_response_count) + + def _assert_responses_exception(self, responses, expected_message): + for response in responses: + self.assertIsNone(response["result"]) + self.assertIsInstance(response["error"], InferenceServerException) + self.assertIn(expected_message, response["error"].message()) + # There may be more responses, but currently only sees one for all tests. 
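+ # The exact-count check below will flag it if that behavior ever changes.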
+ self.assertEqual(len(responses), 1) + + def _assert_decoupled_infer_success( + self, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ): + model_name = "response_sender_decoupled" + responses = self._infer( + model_name, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ) + self._assert_responses_valid( + responses, + number_of_response_before_return, + return_a_response, + number_of_response_after_return, + ) + # Do NOT group into a for-loop as it hides which model failed. + model_name = "response_sender_decoupled_async" + responses = self._infer( + model_name, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ) + self._assert_responses_valid( + responses, + number_of_response_before_return, + return_a_response, + number_of_response_after_return, + ) + + def _assert_non_decoupled_infer_with_expected_response_success( + self, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ): + model_name = "response_sender" + responses = self._infer( + model_name, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ) + self._assert_responses_valid( + responses, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ) + # Do NOT group into a for-loop as it hides which model failed. + model_name = "response_sender_async" + responses = self._infer( + model_name, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ) + self._assert_responses_valid( + responses, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ) + + def _assert_non_decoupled_infer_success( + self, + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + ): + self._assert_non_decoupled_infer_with_expected_response_success( + number_of_response_before_return, + send_complete_final_flag_before_return, + return_a_response, + number_of_response_after_return, + send_complete_final_flag_after_return, + expected_number_of_response_before_return=number_of_response_before_return, + expected_return_a_response=return_a_response, + expected_number_of_response_after_return=number_of_response_after_return, + ) + + # Decoupled model send response final flag before request return. + def test_decoupled_zero_response_pre_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_zero_response_pre_return + ) + + # Decoupled model send response final flag after request return. 
+ def test_decoupled_zero_response_post_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_zero_response_post_return + ) + + # Decoupled model send 1 response before request return. + def test_decoupled_one_response_pre_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_one_response_pre_return + ) + + # Decoupled model send 1 response after request return. + def test_decoupled_one_response_post_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_one_response_post_return + ) + + # Decoupled model send 2 response before request return. + def test_decoupled_two_response_pre_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_two_response_pre_return + ) + + # Decoupled model send 2 response after request return. + def test_decoupled_two_response_post_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_two_response_post_return + ) + + # Decoupled model send 1 and 3 responses before and after return. + def test_decoupled_response_pre_and_post_return(self): + self._assert_decoupled_infer_success( + **self._inputs_parameters_response_pre_and_post_return + ) + + # Non-decoupled model send 1 response on return. + def test_non_decoupled_one_response_on_return(self): + self._assert_non_decoupled_infer_success( + **self._inputs_parameters_one_response_on_return + ) + + # Non-decoupled model send 1 response before return. + def test_non_decoupled_one_response_pre_return(self): + self._assert_non_decoupled_infer_success( + **self._inputs_parameters_one_response_pre_return + ) + + # Non-decoupled model send 1 response after return. + def test_non_decoupled_one_response_post_return(self): + self._assert_non_decoupled_infer_success( + **self._inputs_parameters_one_response_post_return + ) + + # Decoupled model requests each responding differently. + def test_decoupled_multiple_requests(self): + parallel_inputs = [ + self._get_inputs(**self._inputs_parameters_zero_response_pre_return), + self._get_inputs(**self._inputs_parameters_zero_response_post_return), + self._get_inputs(**self._inputs_parameters_one_response_pre_return), + self._get_inputs(**self._inputs_parameters_one_response_post_return), + self._get_inputs(**self._inputs_parameters_two_response_pre_return), + self._get_inputs(**self._inputs_parameters_two_response_post_return), + self._get_inputs(**self._inputs_parameters_response_pre_and_post_return), + ] + expected_number_of_response_before_return = 4 + expected_return_a_response = False + expected_number_of_response_after_return = 6 + + model_name = "response_sender_decoupled_batching" + responses = self._infer_parallel(model_name, parallel_inputs) + self._assert_responses_valid( + responses, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ) + # Do NOT group into a for-loop as it hides which model failed. + model_name = "response_sender_decoupled_async_batching" + responses = self._infer_parallel(model_name, parallel_inputs) + self._assert_responses_valid( + responses, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ) + + # Non-decoupled model requests each responding differently. 
+ def test_non_decoupled_multiple_requests(self): + parallel_inputs = [ + self._get_inputs(**self._inputs_parameters_one_response_on_return), + self._get_inputs(**self._inputs_parameters_one_response_pre_return), + self._get_inputs(**self._inputs_parameters_one_response_post_return), + ] + expected_number_of_response_before_return = 1 + expected_return_a_response = True + expected_number_of_response_after_return = 1 + + model_name = "response_sender_batching" + responses = self._infer_parallel(model_name, parallel_inputs) + self._assert_responses_valid( + responses, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ) + # Do NOT group into a for-loop as it hides which model failed. + model_name = "response_sender_async_batching" + responses = self._infer_parallel(model_name, parallel_inputs) + self._assert_responses_valid( + responses, + expected_number_of_response_before_return, + expected_return_a_response, + expected_number_of_response_after_return, + ) + + # Decoupled model send 1 response on return. + def test_decoupled_one_response_on_return(self): + responses = self._infer( + model_name="response_sender_decoupled", + **self._inputs_parameters_one_response_on_return, + ) + self._assert_responses_exception( + responses, + expected_message="using the decoupled mode and the execute function must return None", + ) + # TODO: Test for async decoupled after fixing 'AsyncEventFutureDoneCallback' + # using `py_future.result()` with error hangs on exit. + + # Decoupled model send 1 response and return 1 response. + def test_decoupled_one_response_pre_and_on_return(self): + # Note: The before return response will send a valid response and close the + # response sender. Then, returning a response will generate an error, but + # since the response sender is closed, nothing is passed to the client. + responses = self._infer( + model_name="response_sender_decoupled", + **self._inputs_parameters_one_response_pre_and_on_return, + ) + self._assert_responses_valid( + responses, + number_of_response_before_return=1, + return_a_response=0, + number_of_response_after_return=0, + ) + # TODO: Test for async decoupled after fixing 'AsyncEventFutureDoneCallback' + # using `py_future.result()` with error hangs on exit. + + # Decoupled model return 1 response and send 1 response. + def test_decoupled_one_response_on_and_post_return(self): + # Note: The returned response will send an error response and complete final + # flag, and close the response sender and factory. Then, sending a + # response will raise an exception. Since the exception happens after the + # model returns, it cannot be caught by the stub (i.e. in a daemon + # thread), so nothing will happen. + responses = self._infer( + model_name="response_sender_decoupled", + **self._inputs_parameters_one_response_on_and_post_return, + ) + self._assert_responses_exception( + responses, + expected_message="using the decoupled mode and the execute function must return None", + ) + # TODO: Test for async decoupled after fixing 'AsyncEventFutureDoneCallback' + # using `py_future.result()` with error hangs on exit. + + # Non-decoupled model send response final flag before request return. + def test_non_decoupled_zero_response_pre_return(self): + # Note: The final flag will raise an exception which stops the model. Since the + # exception happens before the model returns, it will be caught by the + # stub process which pass it to the backend and sent an error response + # with final flag. 
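+ # On the client this surfaces as an InferenceServerException carrying the
+ # message below, which _assert_responses_exception verifies.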
+ expected_message = ( + "Non-decoupled model cannot send complete final before sending a response" + ) + model_name = "response_sender" + responses = self._infer( + model_name, + **self._inputs_parameters_zero_response_pre_return, + ) + self._assert_responses_exception(responses, expected_message) + # Do NOT group into a for-loop as it hides which model failed. + model_name = "response_sender_async" + responses = self._infer( + model_name, + **self._inputs_parameters_zero_response_pre_return, + ) + self._assert_responses_exception(responses, expected_message) + + # Non-decoupled model send response final flag after request return. + @unittest.skip("Model unload will hang, see the TODO comment.") + def test_non_decoupled_zero_response_post_return(self): + # Note: The final flag will raise an exception which stops the model. Since the + # exception happens after the model returns, it cannot be caught by the + # stub (i.e. in a daemon thread), so nothing will happen. + # TODO: Since the stub does not know if the model failed after returning, the + # complete final flag is not sent and will hang when unloading the model. + # How to detect such event and close the response factory? + raise NotImplementedError("No testing is performed") + + # Non-decoupled model send 2 response before return. + def test_non_decoupled_two_response_pre_return(self): + # Note: The 1st response will make its way to the client, but sending the 2nd + # response will raise an exception which stops the model. Since the + # exception happens before the model returns, it will be caught by the + # stub process which pass it to the backend and sent an error response + # with final flag. Since this is non-decoupled model using gRPC stream, + # any response after the 1st will be discarded by the frontend. + self._assert_non_decoupled_infer_with_expected_response_success( + **self._inputs_parameters_two_response_pre_return, + expected_number_of_response_before_return=1, + expected_return_a_response=False, + expected_number_of_response_after_return=0, + ) + + # Non-decoupled model send 2 response after return. + @unittest.skip("Model unload will hang, see the TODO comment.") + def test_non_decoupled_two_response_post_return(self): + # Note: The 1st response will make its way to the client, but sending the 2nd + # response will raise an exception which stops the model. Since the + # exception happens after the model returns, it cannot be caught by the + # stub (i.e. in a daemon thread), so nothing will happen. + # TODO: Since the stub does not know if the model failed after returning, the + # complete final flag is not sent and will hang when unloading the model. + # How to detect such event and close the response factory? + self._assert_non_decoupled_infer_with_expected_response_success( + **self._inputs_parameters_two_response_post_return, + expected_number_of_response_before_return=0, + expected_return_a_response=False, + expected_number_of_response_after_return=1, + ) + + # Non-decoupled model send 1 response and return 1 response. + def test_non_decoupled_one_response_pre_and_on_return(self): + # Note: The sent response will make its way to the client and complete final. + # The returned response will see the response sender is closed and raise + # an exception. The backend should see the request is closed and do + # nothing upon receiving the error from stub. 
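+ # Hence only the single pre-return response is expected to reach the client,
+ # which is what the expected_* arguments below encode.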
+ self._assert_non_decoupled_infer_with_expected_response_success( + **self._inputs_parameters_one_response_pre_and_on_return, + expected_number_of_response_before_return=1, + expected_return_a_response=False, + expected_number_of_response_after_return=0, + ) + + # Non-decoupled model return 1 response and send 1 response. + def test_non_decoupled_one_response_on_and_post_return(self): + # Note: The returned response will send the response to the client and complete + # final. The sent response will see the response sender is closed and + # raise an exception. Since the exception happens after the model returns, + # it cannot be caught by the stub (i.e. in a daemon thread), so nothing + # will happen. + self._assert_non_decoupled_infer_with_expected_response_success( + **self._inputs_parameters_one_response_on_and_post_return, + expected_number_of_response_before_return=0, + expected_return_a_response=True, + expected_number_of_response_after_return=0, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/response_sender/test.sh b/qa/L0_backend_python/response_sender/test.sh new file mode 100755 index 0000000000..cca7e7acfa --- /dev/null +++ b/qa/L0_backend_python/response_sender/test.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +source ../../common/util.sh + +RET=0 + +# +# Test response sender under decoupled / non-decoupled +# +rm -rf models && mkdir models +mkdir -p models/response_sender/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender/1 && \ + cp ../../python_models/response_sender/model.py models/response_sender/1 && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender +mkdir -p models/response_sender_decoupled/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_decoupled/1 && \ + cp ../../python_models/response_sender/model.py models/response_sender_decoupled/1 && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender_decoupled && \ + echo "model_transaction_policy { decoupled: True }" >> models/response_sender_decoupled/config.pbtxt +mkdir -p models/response_sender_async/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_async/1 && \ + cp ../../python_models/response_sender/model_async.py models/response_sender_async/1/model.py && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender_async +mkdir -p models/response_sender_decoupled_async/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_decoupled_async/1 && \ + cp ../../python_models/response_sender/model_async.py models/response_sender_decoupled_async/1/model.py && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender_decoupled_async && \ + echo "model_transaction_policy { decoupled: True }" >> models/response_sender_decoupled_async/config.pbtxt +mkdir -p models/response_sender_batching/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_batching/1 && \ + cp ../../python_models/response_sender/model.py models/response_sender_batching/1 && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender_batching && \ + echo "dynamic_batching { max_queue_delay_microseconds: 500000 }" >> models/response_sender_batching/config.pbtxt +mkdir -p models/response_sender_decoupled_batching/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_decoupled_batching/1 && \ + cp ../../python_models/response_sender/model.py models/response_sender_decoupled_batching/1 && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender_decoupled_batching && \ + echo "model_transaction_policy { decoupled: True }" >> models/response_sender_decoupled_batching/config.pbtxt && \ + echo "dynamic_batching { max_queue_delay_microseconds: 500000 }" >> models/response_sender_decoupled_batching/config.pbtxt +mkdir -p models/response_sender_async_batching/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_async_batching/1 && \ + cp ../../python_models/response_sender/model_async.py models/response_sender_async_batching/1/model.py && \ + cp ../../python_models/response_sender/config.pbtxt models/response_sender_async_batching && \ + echo "dynamic_batching { max_queue_delay_microseconds: 500000 }" >> models/response_sender_async_batching/config.pbtxt +mkdir -p models/response_sender_decoupled_async_batching/1 && \ + cp ../../python_models/response_sender/model_common.py models/response_sender_decoupled_async_batching/1 && \ + cp ../../python_models/response_sender/model_async.py models/response_sender_decoupled_async_batching/1/model.py && \ + cp ../../python_models/response_sender/config.pbtxt 
models/response_sender_decoupled_async_batching && \ + echo "model_transaction_policy { decoupled: True }" >> models/response_sender_decoupled_async_batching/config.pbtxt && \ + echo "dynamic_batching { max_queue_delay_microseconds: 500000 }" >> models/response_sender_decoupled_async_batching/config.pbtxt + +TEST_LOG="response_sender_test.log" +SERVER_LOG="response_sender_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/response_sender/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml response_sender_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** response sender test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# +# Test response sender to raise exception on response after complete final flag +# +rm -rf models && mkdir models +mkdir -p models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/model.py models/response_sender_complete_final/1 && \ + cp ../../python_models/response_sender_complete_final/config.pbtxt models/response_sender_complete_final + +TEST_LOG="response_sender_complete_final_test.log" +SERVER_LOG="response_sender_complete_final_test.server.log" +SERVER_ARGS="--model-repository=${MODELDIR}/response_sender/models --backend-directory=${BACKEND_DIR} --log-verbose=1" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=concurrency_test.report.xml response_sender_complete_final_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** response sender complete final test FAILED\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# +# Test async response sender under decoupled / non-decoupled +# + +# TODO + +if [ $RET -eq 1 ]; then + echo -e "\n***\n*** Response sender test FAILED\n***" +else + echo -e "\n***\n*** Response sender test Passed\n***" +fi +exit $RET diff --git a/qa/L0_backend_python/restart/models/restart/1/model.py b/qa/L0_backend_python/restart/models/restart/1/model.py new file mode 100644 index 0000000000..1f7491498e --- /dev/null +++ b/qa/L0_backend_python/restart/models/restart/1/model.py @@ -0,0 +1,58 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from os import path
+
+import c_python_backend_utils as c_utils
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        # This function will be called once to record the free memory. Then,
+        # the stub process will be killed to trigger Python backend restart.
+        # After that this value will be read again to make sure that it matches
+        # before restart.
+
+        file_name = "free_memory.txt"
+        current_free_memory = str(c_utils.shared_memory.free_memory())
+        if path.exists(file_name):
+            with open(file_name, "r") as f:
+                expected_free_memory = f.read()
+            assert expected_free_memory == current_free_memory, (
+                "Free shared memory before and after restart are not equal. "
+                f"{expected_free_memory} (before) != {current_free_memory} (after)."
+            )
+        else:
+            with open(file_name, "w") as f:
+                f.write(current_free_memory)
+
+        responses = []
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
+            out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
+            responses.append(pb_utils.InferenceResponse([out_tensor]))
+        return responses
diff --git a/qa/L0_backend_python/restart/models/restart/config.pbtxt b/qa/L0_backend_python/restart/models/restart/config.pbtxt
new file mode 100644
index 0000000000..7eb69ce275
--- /dev/null
+++ b/qa/L0_backend_python/restart/models/restart/config.pbtxt
@@ -0,0 +1,52 @@
+# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
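+
+# Minimal single-instance CPU configuration for the "restart" model used by
+# restart_test.py; the Python model echoes INPUT0 back as OUTPUT0.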
+ +name: "restart" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_backend_python/restart/restart_test.py b/qa/L0_backend_python/restart/restart_test.py new file mode 100755 index 0000000000..585548608f --- /dev/null +++ b/qa/L0_backend_python/restart/restart_test.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +sys.path.append("../../common") + +import unittest + +import numpy as np +import shm_util +import tritonclient.http as httpclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class RestartTest(unittest.TestCase): + def setUp(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + + def _infer_helper(self, model_name, shape, data_type): + with httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") as client: + input_data_0 = np.array(np.random.randn(*shape), dtype=data_type) + inputs = [ + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data_0.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data_0) + result = client.infer(model_name, inputs) + output0 = result.as_numpy("OUTPUT0") + self.assertTrue(np.all(input_data_0 == output0)) + + def test_restart(self): + shape = [1, 16] + model_name = "restart" + dtype = np.float32 + + # Since the stub process has been killed, the first request + # will return an exception. + with self.assertRaises(InferenceServerException): + # FIXME: No leak check here as the unhealthy stub error likely causes issues. 
+ # tritonclient.utils.InferenceServerException: [400] Failed to + # process the request(s) for model instance 'restart_0_0', + # message: Stub process 'restart_0_0' is not healthy. + # [restart] Shared memory leak detected: 1007216 (current) > 1007056 (prev). + self._infer_helper(model_name, shape, dtype) + + # The second request should work properly since the stub process should + # have come alive. + with self._shm_leak_detector.Probe() as shm_probe: + self._infer_helper(model_name, shape, dtype) + + def test_infer(self): + shape = [1, 16] + model_name = "restart" + dtype = np.float32 + with self._shm_leak_detector.Probe() as shm_probe: + self._infer_helper(model_name, shape, dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_backend_python/restart/test.sh b/qa/L0_backend_python/restart/test.sh new file mode 100755 index 0000000000..5b14c280b5 --- /dev/null +++ b/qa/L0_backend_python/restart/test.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="./restart_client.log" +SERVER_ARGS="--model-repository=${MODELDIR}/restart/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_LOG="./restart_server.log" +source ../../common/util.sh +source ../common.sh + +rm -fr *.log free_memory.txt + +RET=0 + +prev_num_pages=`get_shm_pages` +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SUBTEST="test_infer" +python3 -m pytest --junitxml=restart.${SUBTEST}.report.xml restart_test.py::RestartTest::${SUBTEST} >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** ${SUBTEST} test FAILED. \n***" + RET=1 +fi +set -e + +# NOTE: with the current setup, tritonserver is launched within wsl, but the stub is started +# in Windows. Therefore, finding the PID of the stub requires a bit more work. 
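+# On Windows the stub runs as triton_python_backend_stub.exe, so it is located
+# with tasklist.exe and terminated with taskkill.exe; on Linux the stubs are
+# child processes of $SERVER_PID and can be killed directly.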
+if [[ ${TEST_WINDOWS} == 1 ]]; then + tasklist=$(/mnt/c/windows/system32/tasklist.exe /FI 'IMAGENAME eq triton_python_backend_stub.exe' /FO CSV) + taskcount=$(echo "$tasklist" | grep -c triton_python_backend_stub) + if [[ $taskcount > 0 ]]; then + echo "$tasklist" | while IFS=, read -r taskname taskpid taskrest; do + if [[ "$taskname" == "\"triton_python_backend_stub.exe\"" ]]; then + taskpid="${taskpid%\"}" + taskpid="${taskpid#\"}" + /mnt/c/windows/system32/taskkill.exe /PID $taskpid /F /T + fi + done + fi +else + triton_procs=$(pgrep --parent $SERVER_PID) + echo $triton_procs + for proc in $triton_procs; do + kill -9 $proc + done +fi + +set +e + +SUBTEST="test_restart" +python3 -m pytest --junitxml=restart.${SUBTEST}.report.xml restart_test.py::RestartTest::${SUBTEST} >> $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** ${SUBTEST} test FAILED. \n***" + RET=1 +fi +set -e + +kill_server + +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + cat $CLIENT_LOG + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages where not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + exit 1 +fi + +# Test if the Triton server exits gracefully when the stub has been killed. +rm $SERVER_LOG +prev_num_pages=`get_shm_pages` +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +triton_procs=`pgrep --parent $SERVER_PID` +echo $triton_procs + +set +e +for proc in $triton_procs; do + kill -9 $proc +done +set -e + +kill_server + +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + cat $CLIENT_LOG + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages where not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + exit 1 +fi + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Restart test FAILED. \n***" +else + echo -e "\n***\n*** Restart test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_backend_python/setup_python_enviroment.sh b/qa/L0_backend_python/setup_python_enviroment.sh new file mode 100755 index 0000000000..a2171e02da --- /dev/null +++ b/qa/L0_backend_python/setup_python_enviroment.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +RET=0 +set -e +if [ ${PYTHON_ENV_VERSION} = "10" ]; then + echo No need to set up anything for default python3.${PYTHON_ENV_VERSION} + exit $RET +fi + +source common.sh +source ../common/util.sh + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +BASE_SERVER_ARGS="--model-repository=${MODELDIR}/models --log-verbose=1 --disable-auto-complete-config" +PYTHON_BACKEND_BRANCH=$PYTHON_BACKEND_REPO_TAG +SERVER_ARGS=$BASE_SERVER_ARGS +SERVER_LOG="./inference_server.log" +export PYTHON_ENV_VERSION=${PYTHON_ENV_VERSION:="10"} +RET=0 +EXPECTED_VERSION_STRINGS="" + +rm -fr ./models +rm -rf *.tar.gz +install_build_deps +install_conda + +# Test other python versions +conda update -n base -c defaults conda -y +# Create a model with python 3.8 version +# Successful execution of the Python model indicates that the environment has +# been setup correctly. +if [ ${PYTHON_ENV_VERSION} = "8" ]; then + create_conda_env "3.8" "python-3-8" + conda install -c conda-forge libstdcxx-ng=12 -y + conda install numpy=1.23.4 -y + conda install tensorflow=2.10.0 -y + EXPECTED_VERSION_STRING="Python version is 3.8, NumPy version is 1.23.4, and Tensorflow version is 2.10.0" + create_python_backend_stub + conda-pack -o python3.8.tar.gz + path_to_conda_pack="$PWD/python-3-8" + mkdir -p $path_to_conda_pack + tar -xzf python3.8.tar.gz -C $path_to_conda_pack + mkdir -p models/python_3_8/1/ + cp ../python_models/python_version/config.pbtxt ./models/python_3_8 + (cd models/python_3_8 && \ + sed -i "s/^name:.*/name: \"python_3_8\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}">> config.pbtxt) + cp ../python_models/python_version/model.py ./models/python_3_8/1/ + cp python_backend/builddir/triton_python_backend_stub ./models/python_3_8 +fi + +# Create a model with python 3.9 version +# Successful execution of the Python model indicates that the environment has +# been setup correctly. 
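+# As with the 3.8 case above, the packed conda environment is attached to the
+# model by appending an EXECUTION_ENV_PATH parameter to config.pbtxt; the
+# resulting entry looks roughly like this (illustrative path only):
+#   parameters: {key: "EXECUTION_ENV_PATH", value: {string_value: "/path/to/python-3-9"}}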
+if [ ${PYTHON_ENV_VERSION} = "9" ]; then + create_conda_env "3.9" "python-3-9" + conda install -c conda-forge libstdcxx-ng=12 -y + conda install numpy=1.23.4 -y + conda install tensorflow=2.10.0 -y + EXPECTED_VERSION_STRING="Python version is 3.9, NumPy version is 1.23.4, and Tensorflow version is 2.10.0" + create_python_backend_stub + conda-pack -o python3.9.tar.gz + path_to_conda_pack="$PWD/python-3-9" + mkdir -p $path_to_conda_pack + tar -xzf python3.9.tar.gz -C $path_to_conda_pack + mkdir -p models/python_3_9/1/ + cp ../python_models/python_version/config.pbtxt ./models/python_3_9 + (cd models/python_3_9 && \ + sed -i "s/^name:.*/name: \"python_3_9\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}">> config.pbtxt) + cp ../python_models/python_version/model.py ./models/python_3_9/1/ + cp python_backend/builddir/triton_python_backend_stub ./models/python_3_9 +fi + +# Create a model with python 3.11 version +# Successful execution of the Python model indicates that the environment has +# been setup correctly. +if [ ${PYTHON_ENV_VERSION} = "11" ]; then + create_conda_env "3.11" "python-3-11" + conda install tensorflow=2.12.0 -y + conda install -c conda-forge libstdcxx-ng=12 -y + conda install numpy=1.23.5 -y + EXPECTED_VERSION_STRING="Python version is 3.11, NumPy version is 1.23.5, and Tensorflow version is 2.12.0" + create_python_backend_stub + conda-pack -o python3.11.tar.gz + path_to_conda_pack="$PWD/python-3-11" + mkdir -p $path_to_conda_pack + tar -xzf python3.11.tar.gz -C $path_to_conda_pack + mkdir -p models/python_3_11/1/ + cp ../python_models/python_version/config.pbtxt ./models/python_3_11 + (cd models/python_3_11 && \ + sed -i "s/^name:.*/name: \"python_3_11\"/" config.pbtxt && \ + echo "parameters: {key: \"EXECUTION_ENV_PATH\", value: {string_value: \"$path_to_conda_pack\"}}">> config.pbtxt) + cp ../python_models/python_version/model.py ./models/python_3_11/1/ + cp python_backend/builddir/triton_python_backend_stub ./models/python_3_11 +fi +conda deactivate +rm -rf ./miniconda + +# test that +set +e +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +kill_server + +grep "$EXPECTED_VERSION_STRING" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** $EXPECTED_VERSION_STRING was not found in Triton logs. 
\n***" + RET=1 +fi +set -e + +echo "python environment 3.${PYTHON_ENV_VERSION}" +# copy the stub out to /opt/tritonserver/backends/python/triton_python_backend_stub +cp python_backend/builddir/triton_python_backend_stub /opt/tritonserver/backends/python/triton_python_backend_stub +# Set up environment and stub for each test +add-apt-repository ppa:deadsnakes/ppa -y +apt-get update && apt-get -y install \ + "python3.${PYTHON_ENV_VERSION}-dev" \ + "python3.${PYTHON_ENV_VERSION}-distutils" \ + libboost-dev +rm -f /usr/bin/python3 && \ +ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3 +pip3 install --upgrade requests numpy virtualenv protobuf +find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \ + "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \ + xargs pip3 install --upgrade + +# Build triton-shm-monitor for the test +cd python_backend && rm -rf install build && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX:PATH=$PWD/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \ + -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \ + make -j16 triton-shm-monitor install +cp $PWD/install/backends/python/triton_shm_monitor.cpython-* /opt/tritonserver/qa/common/. +set +e +exit $RET diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh new file mode 100755 index 0000000000..324ee5ba1f --- /dev/null +++ b/qa/L0_backend_python/test.sh @@ -0,0 +1,501 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi + +# On windows the paths invoked by the script (running in WSL) must use +# /mnt/c when needed but the paths on the tritonserver command-line +# must be C:/ style. 
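+# For example, the same model repository may be referenced as
+# /mnt/c/tritonserver/models from within WSL but as C:/tritonserver/models on
+# the tritonserver command line. If a conversion is ever needed, WSL's wslpath
+# utility can translate between the two forms (illustrative paths only):
+#   wslpath -w /mnt/c/tritonserver/models    # -> C:\tritonserver\models
+#   wslpath -u 'C:\tritonserver\models'      # -> /mnt/c/tritonserver/models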
+export TEST_WINDOWS=0 +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then + export DATADIR=${DATADIR:="/c/data/inferenceserver/${REPO_VERSION}"} + export TRITON_DIR=${TRITON_DIR:=c:/tritonserver} + # This will run in WSL, but Triton will run in windows, so environment + # variables meant for loaded models must be exported using WSLENV. + # The /w flag indicates the value should only be included when invoking + # Win32 from WSL. + export WSLENV=TRITON_DIR + export SERVER=${SERVER:=c:/tritonserver/bin/tritonserver.exe} + export BACKEND_DIR=${BACKEND_DIR:=c:/tritonserver/backends} + export MODELDIR=${MODELDIR:=c:/} + export TEST_WINDOWS=1 +else + export DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + export TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} + export SERVER=${TRITON_DIR}/bin/tritonserver + export BACKEND_DIR=${TRITON_DIR}/backends + export MODELDIR=${MODELDIR:=`pwd`} +fi +export REPO_VERSION=$REPO_VERSION +export TEST_JETSON=${TEST_JETSON:=0} +export CUDA_VISIBLE_DEVICES=0 +export PYTHON_ENV_VERSION=${PYTHON_ENV_VERSION:="10"} +export PYTHON_BACKEND_REPO_TAG=$PYTHON_BACKEND_REPO_TAG + +BASE_SERVER_ARGS="--model-repository=${MODELDIR}/models --backend-directory=${BACKEND_DIR} --log-verbose=1" +# Set the default byte size to 5MBs to avoid going out of shared memory. The +# environment that this job runs on has only 1GB of shared-memory available. +SERVER_ARGS="$BASE_SERVER_ARGS --backend-config=python,shm-default-byte-size=5242880" + +CLIENT_PY=./python_test.py +CLIENT_LOG="./client.log" +TEST_RESULT_FILE='test_results.txt' +SERVER_LOG="./inference_server.log" +source ../common/util.sh +source ./common.sh + +rm -fr *.log ./models + +python3 --version | grep "3.10" > /dev/null +if [ $? -ne 0 ]; then + echo -e "Expecting Python default version to be: Python 3.10 but actual version is $(python3 --version)" + exit 1 +fi + +(bash -ex setup_python_enviroment.sh) + +python3 --version | grep "3.${PYTHON_ENV_VERSION}" > /dev/null +if [ $? 
-ne 0 ]; then + echo -e "Expecting Python version to be: Python 3.${PYTHON_ENV_VERSION} but actual version is $(python3 --version)" + exit 1 +fi + +mkdir -p models/identity_fp32/1/ +cp ../python_models/identity_fp32/model.py ./models/identity_fp32/1/model.py +cp ../python_models/identity_fp32/config.pbtxt ./models/identity_fp32/config.pbtxt +mkdir -p models/identity_bf16/1/ +cp ../python_models/identity_bf16/model.py ./models/identity_bf16/1/model.py +cp ../python_models/identity_bf16/config.pbtxt ./models/identity_bf16/config.pbtxt +RET=0 + +cp -r ./models/identity_fp32 ./models/identity_uint8 +(cd models/identity_uint8 && \ + sed -i "s/^name:.*/name: \"identity_uint8\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_UINT8/g" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) + +cp -r ./models/identity_fp32 ./models/identity_uint8_nobatch +(cd models/identity_uint8_nobatch && \ + sed -i "s/^name:.*/name: \"identity_uint8_nobatch\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_UINT8/g" config.pbtxt && \ + sed -i "s/^max_batch_size:.*//" config.pbtxt >> config.pbtxt) + +cp -r ./models/identity_fp32 ./models/identity_uint32 +(cd models/identity_uint32 && \ + sed -i "s/^name:.*/name: \"identity_uint32\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_UINT32/g" config.pbtxt) + +cp -r ./models/identity_fp32 ./models/identity_bool +(cd models/identity_bool && \ + sed -i "s/^name:.*/name: \"identity_bool\"/" config.pbtxt && \ + sed -i "s/TYPE_FP32/TYPE_BOOL/g" config.pbtxt) + +# Test models with `default_model_filename` variable set. +cp -r ./models/identity_fp32 ./models/default_model_name +mv ./models/default_model_name/1/model.py ./models/default_model_name/1/mymodel.py +(cd models/default_model_name && \ + sed -i "s/^name:.*/name: \"default_model_name\"/" config.pbtxt && \ + echo "default_model_filename: \"mymodel.py\"" >> config.pbtxt ) + +mkdir -p models/pytorch_fp32_fp32/1/ + cp -r ../python_models/pytorch_fp32_fp32/model.py ./models/pytorch_fp32_fp32/1/ + cp ../python_models/pytorch_fp32_fp32/config.pbtxt ./models/pytorch_fp32_fp32/ + (cd models/pytorch_fp32_fp32 && \ + sed -i "s/^name:.*/name: \"pytorch_fp32_fp32\"/" config.pbtxt) + +mkdir -p models/delayed_model/1/ +cp -r ../python_models/delayed_model/model.py ./models/delayed_model/1/ +cp ../python_models/delayed_model/config.pbtxt ./models/delayed_model/ +mkdir -p models/init_args/1/ +cp ../python_models/init_args/model.py ./models/init_args/1/ +cp ../python_models/init_args/config.pbtxt ./models/init_args/ +sed -i "s|TRITON_DIR_PATH|${TRITON_DIR}|" ./models/init_args/config.pbtxt + + +mkdir -p models/optional/1/ +cp ../python_models/optional/model.py ./models/optional/1/ +cp ../python_models/optional/config.pbtxt ./models/optional/ + +mkdir -p models/non_contiguous/1/ +cp ../python_models/non_contiguous/model.py ./models/non_contiguous/1/ +cp ../python_models/non_contiguous/config.pbtxt ./models/non_contiguous/config.pbtxt + +# Unicode Characters +mkdir -p models/string/1/ +cp ../python_models/string/model.py ./models/string/1/ +cp ../python_models/string/config.pbtxt ./models/string + +# More string tests +mkdir -p models/string_fixed/1/ +cp ../python_models/string_fixed/model.py ./models/string_fixed/1/ +cp ../python_models/string_fixed/config.pbtxt ./models/string_fixed + +mkdir -p models/dlpack_identity/1/ +cp ../python_models/dlpack_identity/model.py ./models/dlpack_identity/1/ +cp 
../python_models/dlpack_identity/config.pbtxt ./models/dlpack_identity + + +if [ "$TEST_JETSON" == "0" ] && [[ ${TEST_WINDOWS} == 0 ]]; then + pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html +else + # GPU tensor tests are disabled on jetson + pip3 install torch==1.13.0 -f https://download.pytorch.org/whl/torch_stable.html +fi + +pip3 install pytest requests virtualenv + +prev_num_pages=`get_shm_pages` +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +set +e +python3 -m pytest --junitxml=L0_backend_python.report.xml $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +fi +set -e + +kill_server + +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + ls /dev/shm + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed. Shared memory pages where not cleaned properly. +Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + RET=1 +fi + +prev_num_pages=`get_shm_pages` +# Triton non-graceful exit +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +sleep 5 + +readarray -t triton_procs < <(pgrep --parent ${SERVER_PID}) + +set +e + +# Trigger non-graceful termination of Triton +kill -9 $SERVER_PID + +# Wait 10 seconds so that Python stub can detect non-graceful exit +sleep 10 + +for triton_proc in $triton_procs; do + kill -0 $triton_proc > /dev/null 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Python backend non-graceful exit test failed \n***" + RET=1 + break + fi +done +set -e + +# +# Test KIND_GPU +# Disable env test for Jetson & Windows since GPU Tensors are not supported +if [ "$TEST_JETSON" == "0" ] && [[ ${TEST_WINDOWS} == 0 ]]; then + rm -rf models/ + mkdir -p models/add_sub_gpu/1/ + cp ../python_models/add_sub/model.py ./models/add_sub_gpu/1/ + cp ../python_models/add_sub_gpu/config.pbtxt ./models/add_sub_gpu/ + + prev_num_pages=`get_shm_pages` + run_server + if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 + fi + + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** KIND_GPU model test failed \n***" + RET=1 + fi + + kill_server + + current_num_pages=`get_shm_pages` + if [ $current_num_pages -ne $prev_num_pages ]; then + cat $CLIENT_LOG + ls /dev/shm + echo -e "\n***\n*** Test Failed. Shared memory pages where not cleaned properly. + Shared memory pages before starting triton equals to $prev_num_pages + and shared memory pages after starting triton equals to $current_num_pages \n***" + exit 1 + fi +fi + +# Test Multi file models +rm -rf models/ +mkdir -p models/multi_file/1/ +cp ../python_models/multi_file/*.py ./models/multi_file/1/ +cp ../python_models/identity_fp32/config.pbtxt ./models/multi_file/ +(cd models/multi_file && \ + sed -i "s/^name:.*/name: \"multi_file\"/" config.pbtxt) + +prev_num_pages=`get_shm_pages` +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 +fi + +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** multi-file model test failed \n***" + RET=1 +fi + +kill_server + +current_num_pages=`get_shm_pages` +if [ $current_num_pages -ne $prev_num_pages ]; then + cat $SERVER_LOG + ls /dev/shm + echo -e "\n***\n*** Test Failed. 
Shared memory pages were not cleaned properly.
+Shared memory pages before starting triton equals to $prev_num_pages
+and shared memory pages after starting triton equals to $current_num_pages \n***"
+    exit 1
+fi
+
+# Test environment variable propagation
+rm -rf models/
+mkdir -p models/model_env/1/
+cp ../python_models/model_env/model.py ./models/model_env/1/
+cp ../python_models/model_env/config.pbtxt ./models/model_env/
+
+export MY_ENV="MY_ENV"
+if [[ ${TEST_WINDOWS} == 1 ]]; then
+    # This will run in WSL, but Triton will run in Windows, so environment
+    # variables meant for loaded models must be exported using WSLENV.
+    # The /w flag indicates the value should only be included when invoking
+    # Win32 from WSL.
+    export WSLENV=MY_ENV/w
+fi
+
+prev_num_pages=`get_shm_pages`
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    cat $SERVER_LOG
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    echo -e "\n***\n*** Environment variable test failed \n***"
+    exit 1
+fi
+
+kill_server
+
+current_num_pages=`get_shm_pages`
+if [ $current_num_pages -ne $prev_num_pages ]; then
+    cat $CLIENT_LOG
+    ls /dev/shm
+    echo -e "\n***\n*** Test Failed. Shared memory pages were not cleaned properly.
+Shared memory pages before starting triton equals to $prev_num_pages
+and shared memory pages after starting triton equals to $current_num_pages \n***"
+    exit 1
+fi
+
+rm -fr ./models
+mkdir -p models/identity_fp32/1/
+cp ../python_models/identity_fp32/model.py ./models/identity_fp32/1/model.py
+cp ../python_models/identity_fp32/config.pbtxt ./models/identity_fp32/config.pbtxt
+
+shm_default_byte_size=$((1024*1024*4))
+SERVER_ARGS="$BASE_SERVER_ARGS --backend-config=python,shm-default-byte-size=$shm_default_byte_size"
+
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    cat $SERVER_LOG
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    exit 1
+fi
+
+for shm_page in `ls /dev/shm/`; do
+    if [[ $shm_page != triton_python_backend_shm* ]]; then
+        continue
+    fi
+    page_size=`ls -l /dev/shm/$shm_page 2>&1 | awk '{print $5}'`
+    if [ $page_size -ne $shm_default_byte_size ]; then
+        echo -e "Shared memory region size is not equal to
+$shm_default_byte_size for page $shm_page. Region size is
+$page_size."
+        RET=1
+    fi
+done
+
+kill_server
+
+# Test model getting killed during initialization
+rm -fr ./models
+mkdir -p models/init_exit/1/
+cp ../python_models/init_exit/model.py ./models/init_exit/1/model.py
+cp ../python_models/init_exit/config.pbtxt ./models/init_exit/config.pbtxt
+
+ERROR_MESSAGE="Stub process 'init_exit_0_0' is not healthy."
+
+prev_num_pages=`get_shm_pages`
+run_server
+if [ "$SERVER_PID" != "0" ]; then
+    echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG
+    RET=1
+    kill_server
+else
+    if grep "$ERROR_MESSAGE" $SERVER_LOG; then
+        echo -e "Found \"$ERROR_MESSAGE\"" >> $CLIENT_LOG
+    else
+        cat $CLIENT_LOG
+        echo -e "Not found \"$ERROR_MESSAGE\"" >> $CLIENT_LOG
+        RET=1
+    fi
+fi
+
+current_num_pages=`get_shm_pages`
+if [ $current_num_pages -ne $prev_num_pages ]; then
+    cat $SERVER_LOG
+    ls /dev/shm
+    echo -e "\n***\n*** Test Failed. Shared memory pages were not cleaned properly.
+Shared memory pages before starting triton equals to $prev_num_pages +and shared memory pages after starting triton equals to $current_num_pages \n***" + exit 1 +fi + +# Disable env test for Jetson since cloud storage repos are not supported +# Disable ensemble, io and bls tests for Jetson since GPU Tensors are not supported +# Disable variants test for Jetson since already built without GPU Tensor support +# Disable decoupled test because it uses GPU tensors +if [ "$TEST_JETSON" == "0" ]; then + SUBTESTS="ensemble bls decoupled response_sender" + # [DLIS-6093] Disable variants test for Windows since tests are not executed in docker container (cannot apt update/install) + # [DLIS-5970] Disable io tests for Windows since GPU Tensors are not supported + # [DLIS-6122] Disable model_control & request_rescheduling tests for Windows since they require load/unload + if [[ ${TEST_WINDOWS} == 0 ]]; then + SUBTESTS+=" variants io python_based_backends async_execute" + fi + + for TEST in ${SUBTESTS}; do + # Run each subtest in a separate virtual environment to avoid conflicts + # between dependencies. + setup_virtualenv + + set +e + (cd ${TEST} && bash -ex test.sh) + EXIT_CODE=$? + if [ $EXIT_CODE -ne 0 ]; then + echo "Subtest ${TEST} FAILED" + RET=$EXIT_CODE + + # In bls test, it is allowed to fail with a strict memory leak of 480 bytes with exit code '123'. + # Propagate the exit code to make sure it's not overwritten by other tests. + if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then + BLS_RET=$RET + fi + fi + set -e + + deactivate_virtualenv + done + + # [DLIS-5969]: Incorporate env test for windows + if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then + # In 'env' test we use miniconda for dependency management. No need to run + # the test in a virtual environment. + set +e + (cd env && bash -ex test.sh) + if [ $? -ne 0 ]; then + echo "Subtest env FAILED" + RET=1 + fi + set -e + fi +fi + +SUBTESTS="lifecycle argument_validation logging custom_metrics" +# [DLIS-6124] Disable restart test for Windows since it requires more investigation +# [DLIS-6122] Disable model_control & request_rescheduling tests for Windows since they require load/unload +# [DLIS-6123] Disable examples test for Windows since it requires updates to the example clients +if [[ ${TEST_WINDOWS} == 0 ]]; then + # TODO: Reimplement restart on decoupled data pipeline and enable restart. + SUBTESTS+=" model_control examples request_rescheduling" +fi +for TEST in ${SUBTESTS}; do + # Run each subtest in a separate virtual environment to avoid conflicts + # between dependencies. + setup_virtualenv + + set +e + (cd ${TEST} && bash -ex test.sh) + + if [ $? -ne 0 ]; then + echo "Subtest ${TEST} FAILED" + RET=1 + fi + set -e + + deactivate_virtualenv +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +# Exit with RET if it is 1, meaning that the test failed. +# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured. +if [ $RET -eq 1 ]; then + exit $RET +else + if [ -z "$BLS_RET" ]; then + exit $RET + else + exit $BLS_RET + fi +fi diff --git a/qa/L0_backend_python/test_infer_shm_leak.py b/qa/L0_backend_python/test_infer_shm_leak.py new file mode 100755 index 0000000000..966243e86e --- /dev/null +++ b/qa/L0_backend_python/test_infer_shm_leak.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../../common") + +import os +import unittest + +import pytest +import shm_util +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +# By default, find tritonserver on "localhost", but for windows tests +# we overwrite the IP address with the TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +# The exit code 123 is used to indicate that the shm leak probe detected a 480 +# bytes leak in the bls sub-test. Any leak other than 480 bytes will cause the +# test to fail with the default exit code 1. +ALLOWED_FAILURE_EXIT_CODE = 123 + + +class TestInferShmLeak: + def _run_unittest(self, model_name): + with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client: + # No input is required + result = client.infer(model_name, [], client_timeout=240) + output0 = result.as_numpy("OUTPUT0") + + # The model returns 1 if the tests were successfully passed. + # Otherwise, it will return 0. + assert output0 == [1], f"python_unittest failed for model {model_name}" + + def test_shm_leak(self): + self._shm_leak_detector = shm_util.ShmLeakDetector() + model_name = os.environ.get("MODEL_NAME", "default_model") + + try: + with self._shm_leak_detector.Probe() as shm_probe: + self._run_unittest(model_name) + except AssertionError as e: + if "Known shared memory leak of 480 bytes detected" in str(e): + pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE) + else: + raise e diff --git a/qa/L0_backend_python/variants/test.sh b/qa/L0_backend_python/variants/test.sh new file mode 100755 index 0000000000..86cc793a94 --- /dev/null +++ b/qa/L0_backend_python/variants/test.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Building a CPU build of Python backend +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} + +source ../common.sh +install_build_deps +rm -rf python_backend + +git clone ${TRITON_REPO_ORGANIZATION}/python_backend -b $PYTHON_BACKEND_REPO_TAG +(cd python_backend/ && mkdir builddir && cd builddir && \ + cmake -DTRITON_ENABLE_GPU=OFF -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} -DTRITON_BACKEND_REPO_TAG=$TRITON_BACKEND_REPO_TAG -DTRITON_COMMON_REPO_TAG=$TRITON_COMMON_REPO_TAG -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG ../ && \ + make -j18 install) + +if [ $? == 0 ]; then + echo -e "\n***\n*** No CPU build test PASSED.\n***" +else + echo -e "\n***\n*** No CPU build test FAILED.\n***" +fi + diff --git a/qa/L0_backend_release/test.sh b/qa/L0_backend_release/test.sh new file mode 100755 index 0000000000..def25499ab --- /dev/null +++ b/qa/L0_backend_release/test.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +SIMPLE_CLIENT=../clients/simple_http_infer_client +SIMPLE_SEQ_CLIENT=../clients/simple_grpc_sequence_stream_infer_client + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +source ../common/util.sh + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +rm -fr *.log + +# This is a test of the schedulers to make sure they correctly release +# their own backend so don't need to test across all frameworks. Set +# the delay, in milliseconds, that will cause the scheduler to be the +# last holding the backend handle. +export TRITONSERVER_DELAY_SCHEDULER_BACKEND_RELEASE=5000 + +# dynamic batcher - 1 instance +rm -fr models && cp -r simple_models models +(cd models/simple && echo "instance_group [{ count: 1 }]" >> config.pbtxt) + +SERVER_LOG="./inference_server_1.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +$SIMPLE_CLIENT -v >> client_simple.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# dynamic batcher - 4 instance +rm -fr models && cp -r simple_models models +(cd models/simple && echo "instance_group [{ count: 4 }]" >> config.pbtxt) + +SERVER_LOG="./inference_server_4.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +$SIMPLE_CLIENT -v >> client_simple.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# sequence batcher - 1 instance +rm -fr models && cp -r simple_seq_models models +(cd models/simple_sequence && \ + sed -i "s/sequence_batching.*{.*/sequence_batching { max_sequence_idle_microseconds: 10000000/" \ + config.pbtxt) + +SERVER_LOG="./inference_server_seq_1.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +$SIMPLE_SEQ_CLIENT -v >> client_simple_seq.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# sequence batcher - 4 instance +rm -fr models && cp -r simple_seq_models models +(cd models/simple_sequence && \ + echo "instance_group [{ count: 3 }]" >> config.pbtxt && \ + sed -i "s/sequence_batching.*{.*/sequence_batching { max_sequence_idle_microseconds: 10000000/" \ + config.pbtxt) + +SERVER_LOG="./inference_server_seq_4.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +$SIMPLE_SEQ_CLIENT -v >> client_simple_seq.log 2>&1 +if [ $? 
-ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_backend_tutorial/test.sh b/qa/L0_backend_tutorial/test.sh new file mode 100755 index 0000000000..52319f90ba --- /dev/null +++ b/qa/L0_backend_tutorial/test.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG:="main"} +TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG:="main"} +TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG:="main"} + +MINIMAL_LOG="./minimal.log" +RECOMMENDED_LOG="./recommended.log" + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +# Client build requires recent version of CMake (FetchContent required) +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . 
/etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* \ + rapidjson-dev +cmake --version + +rm -fr *.log ./backend +git clone --single-branch --depth=1 -b $TRITON_BACKEND_REPO_TAG \ + ${TRITON_REPO_ORGANIZATION}/backend.git + +# +# Minimal backend +# +(cd backend/examples/backends/minimal && + mkdir build && + cd build && + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + .. && + make -j4 install) + +rm -fr /opt/tritonserver/backends/minimal +cp -r backend/examples/backends/minimal/build/install/backends/minimal /opt/tritonserver/backends/. + +SERVER_LOG="./minimal_server.log" +SERVER_ARGS="--model-repository=`pwd`/backend/examples/model_repos/minimal_models" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +backend/examples/clients/minimal_client >> ${MINIMAL_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat $MINIMAL_LOG + RET=1 +fi + +grep "OUT0 = \[1 2 3 4\]" $MINIMAL_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify minimal nonbatching example. \n***" + cat $MINIMAL_LOG + RET=1 +fi + +grep "OUT0 = \[\[10 11 12 13\]\]" $MINIMAL_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify minimal batching example. \n***" + cat $MINIMAL_LOG + RET=1 +fi + +grep "OUT0 = \[\[20 21 22 23\]\]" $MINIMAL_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify minimal batching example. \n***" + cat $MINIMAL_LOG + RET=1 +fi + +grep "model batching: requests in batch 2" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify minimal server log. \n***" + cat $SERVER_LOG + cat $MINIMAL_LOG + RET=1 +fi + +grep "batched IN0 value: \[ 10, 11, 12, 13, 20, 21, 22, 23 \]" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify minimal server log. \n***" + cat $SERVER_LOG + cat $MINIMAL_LOG + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +rm -fr /opt/tritonserver/backends/minimal + +# +# Recommended backend +# +(cd backend/examples/backends/recommended && + mkdir build && + cd build && + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + .. && + make -j4 install) + +rm -fr /opt/tritonserver/backends/recommended +cp -r backend/examples/backends/recommended/build/install/backends/recommended /opt/tritonserver/backends/. + +SERVER_LOG="./recommended_server.log" +SERVER_ARGS="--model-repository=`pwd`/backend/examples/model_repos/recommended_models" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +backend/examples/clients/recommended_client >> ${RECOMMENDED_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat $RECOMMENDED_LOG + RET=1 +fi + +grep -z "OUTPUT = \[\[\[1. 1.1 1.2 1.3\].*\[2. 2.1 2.2 2.3\].*\[3. 3.1 3.2 3.3\].*\[4. 
4.1 4.2 4.3\]\]\]" $RECOMMENDED_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify recommended example. \n***" + cat $RECOMMENDED_LOG + RET=1 +fi + +grep -z "OUTPUT = \[\[\[10. 10.1 10.2 10.3\].*\[20. 20.1 20.2 20.3\].*\[30. 30.1 30.2 30.3\].*\[40. 40.1 40.2 40.3\]\]\]" $RECOMMENDED_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify recommended example. \n***" + cat $RECOMMENDED_LOG + RET=1 +fi + +grep "model batching: requests in batch 2" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to verify recommended server log. \n***" + cat $SERVER_LOG + cat $RECOMMENDED_LOG + RET=1 +fi + +FOUND_MATCH=0 +grep "batched INPUT value: \[ 1.000000, 1.100000, 1.200000, 1.300000, 2.000000, 2.100000, 2.200000, 2.300000, 3.000000, 3.100000, 3.200000, 3.300000, 4.000000, 4.100000, 4.200000, 4.300000, 10.000000, 10.100000, 10.200000, 10.300000, 20.000000, 20.100000, 20.200001, 20.299999, 30.000000, 30.100000, 30.200001, 30.299999, 40.000000, 40.099998, 40.200001, 40.299999 \]" $SERVER_LOG +if [ $? -ne 0 ]; then + FOUND_MATCH=1 +fi +grep "batched INPUT value: \[ 10.000000, 10.100000, 10.200000, 10.300000, 20.000000, 20.100000, 20.200001, 20.299999, 30.000000, 30.100000, 30.200001, 30.299999, 40.000000, 40.099998, 40.200001, 40.299999, 1.000000, 1.100000, 1.200000, 1.300000, 2.000000, 2.100000, 2.200000, 2.300000, 3.000000, 3.100000, 3.200000, 3.300000, 4.000000, 4.100000, 4.200000, 4.300000 \]" $SERVER_LOG +if [ $? -ne 0 ]; then + FOUND_MATCH=1 +fi +if [ $FOUND_MATCH -eq 0 ]; then + echo -e "\n***\n*** Failed to verify recommended server log. \n***" + cat $SERVER_LOG + cat $RECOMMENDED_LOG + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +rm -fr /opt/tritonserver/backends/recommended + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py new file mode 100755 index 0000000000..6cd6346ad3 --- /dev/null +++ b/qa/L0_batch_custom/batch_custom_test.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import threading +import time +import unittest +from builtins import range +from collections.abc import Iterable + +import infer_util as iu +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient + +# By default, find tritonserver on "localhost", but can be overridden +# with TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +_deferred_exceptions_lock = threading.Lock() +_deferred_exceptions = [] + + +class BatcherTest(tu.TestResultCollector): + def setUp(self): + # The helper client for setup will be GRPC for simplicity. + self.triton_client_ = grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) + self.precreated_shm_regions_ = [] + global _deferred_exceptions + _deferred_exceptions = [] + + def tearDown(self): + super().tearDown() + + def add_deferred_exception(self, ex): + global _deferred_exceptions + with _deferred_exceptions_lock: + _deferred_exceptions.append(ex) + + def check_deferred_exception(self): + # Just raise one of the exceptions... + with _deferred_exceptions_lock: + if len(_deferred_exceptions) > 0: + raise _deferred_exceptions[0] + + def check_response( + self, + trial, + bs, + thresholds, + requested_outputs=("OUTPUT0", "OUTPUT1"), + input_size=16, + shm_region_names=None, + precreated_shm_regions=None, + ): + try: + start_ms = int(round(time.time() * 1000)) + + if ( + trial == "savedmodel" + or trial == "graphdef" + or trial == "libtorch" + or trial == "onnx" + or trial == "plan" + or trial == "python" + ): + tensor_shape = (bs, input_size) + iu.infer_exact( + self, + trial, + tensor_shape, + bs, + np.float32, + np.float32, + np.float32, + swap=False, + model_version=1, + outputs=requested_outputs, + use_http=False, + use_grpc=False, + use_http_json_tensors=False, + skip_request_id_check=True, + use_streaming=False, + ) + else: + self.assertFalse(True, "unknown trial type: " + trial) + + end_ms = int(round(time.time() * 1000)) + + lt_ms = thresholds[0] + gt_ms = thresholds[1] + if lt_ms is not None: + self.assertTrue( + (end_ms - start_ms) < lt_ms, + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + if gt_ms is not None: + self.assertTrue( + (end_ms - start_ms) > gt_ms, + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + except Exception as ex: + self.add_deferred_exception(ex) + + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, exec_count): + # There is a time window between when responses are returned and statistics are updated. + # To prevent intermittent test failure during that window, wait up to 10 seconds for the + # inference statistics to be ready. 
+ num_tries = 10 + for i in range(num_tries): + stats = self.triton_client_.get_inference_statistics(model_name, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + actual_exec_cnt = stats.model_stats[0].execution_count + if actual_exec_cnt == exec_count: + break + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_count, actual_exec_cnt, i + ) + ) + time.sleep(1) + + self.assertEqual( + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) + + if batch_exec: + batch_stats = stats.model_stats[0].batch_stats + self.assertEqual( + len(batch_stats), + len(batch_exec), + "expected {} different batch-sizes, got {}".format( + len(batch_exec), len(batch_stats) + ), + ) + + for batch_stat in batch_stats: + bs = batch_stat.batch_size + bc = batch_stat.compute_infer.count + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) + # Get count from one of the stats + self.assertEqual( + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) + + actual_request_cnt = stats.model_stats[0].inference_stats.success.count + self.assertEqual( + actual_request_cnt, + request_cnt, + "expected model-request-count {}, got {}".format( + request_cnt, actual_request_cnt + ), + ) + + actual_exec_cnt = stats.model_stats[0].execution_count + if isinstance(exec_count, Iterable): + self.assertIn( + actual_exec_cnt, + exec_count, + "expected model-exec-count {}, got {}".format( + exec_count, actual_exec_cnt + ), + ) + else: + self.assertEqual( + actual_exec_cnt, + exec_count, + "expected model-exec-count {}, got {}".format( + exec_count, actual_exec_cnt + ), + ) + actual_infer_cnt = stats.model_stats[0].inference_count + self.assertEqual( + actual_infer_cnt, + infer_cnt, + "expected model-inference-count {}, got {}".format( + infer_cnt, actual_infer_cnt + ), + ) + + def test_volume_batching(self): + # Send 12 requests with batch size 1. The max_queue_delay is set + # to non-zero. Depending upon the timing of the requests arrival + # there can be either 4-6 model executions. + model_base = "onnx" + dtype = np.float16 + shapes = ( + [ + 1, + 4, + 4, + ], + ) + + try: + # use threads to send 12 requests without waiting for response + threads = [] + for i in range(12): + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_http": True, + "use_grpc": False, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + model_name = tu.get_zero_model_name(model_base, len(shapes), dtype) + self.check_status(model_name, None, 12, 12, (4, 5, 6)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh new file mode 100755 index 0000000000..96dba468f8 --- /dev/null +++ b/qa/L0_batch_custom/test.sh @@ -0,0 +1,196 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## This test tests the ability to use custom batching strategies with models. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +BATCH_CUSTOM_TEST=batch_custom_test.py +CLIENT_LOG_BASE="./client.log" +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository +EXPECTED_NUM_TESTS="1" +MODEL_NAME="onnx_zero_1_float16" +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --log-verbose 1" +SERVER_LOG_BASE="./inference_server.log" +TEST_RESULT_FILE='test_results.txt' +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG:="main"} +TRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG:="main"} + +source ../common/util.sh +RET=0 + +# Batch strategy build requires recent version of CMake (FetchContent required) +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . 
/etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* rapidjson-dev +cmake --version + +# Set up repository +rm -fr *.log* ./backend +rm -fr models && mkdir models +cp -r $DATADIR/$MODEL_NAME models + +CONFIG_PATH="models/${MODEL_NAME}/config.pbtxt" +echo "dynamic_batching { max_queue_delay_microseconds: 10000}" >> ${CONFIG_PATH} +echo "instance_group [ { kind: KIND_GPU count: 2 }]" >> ${CONFIG_PATH} +echo "parameters { key: \"MAX_BATCH_VOLUME_BYTES\" value: {string_value: \"96\"}}" >> ${CONFIG_PATH} + +# Create custom batching libraries +git clone --single-branch --depth=1 -b $TRITON_BACKEND_REPO_TAG \ + ${TRITON_REPO_ORGANIZATION}/backend.git + +(cd backend/examples/batching_strategies/volume_batching && + mkdir build && + cd build && + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG .. && + make -j4 install) + + (cd backend/examples/batching_strategies/single_batching && + mkdir build && + cd build && + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG .. && + make -j4 install) + +cp -r backend/examples/batching_strategies/volume_batching/build/libtriton_volumebatching.so models +cp -r backend/examples/batching_strategies/single_batching/build/libtriton_singlebatching.so models + +# Run a test to validate the single batching strategy example. +# Then, run tests to validate the volume batching example being passed in via the backend dir, model dir, version dir, and model config. +BACKEND_DIR="/opt/tritonserver/backends/onnxruntime" +MODEL_DIR="models/$MODEL_NAME" +VERSION_DIR="$MODEL_DIR/1/" + +test_types=('single_batching_backend' 'backend_directory' 'model_directory' 'version_directory' 'model_config') +test_setups=("cp models/libtriton_singlebatching.so ${BACKEND_DIR}/batchstrategy.so && sed -i \"s/(4, 5, 6))/(12))/\" ${BATCH_CUSTOM_TEST}" + "cp models/libtriton_volumebatching.so ${BACKEND_DIR}/batchstrategy.so && sed -i \"s/(12))/(4, 5, 6))/\" ${BATCH_CUSTOM_TEST}" + "mv ${BACKEND_DIR}/batchstrategy.so ${MODEL_DIR} && cp models/libtriton_singlebatching.so ${BACKEND_DIR}" + "mv ${MODEL_DIR}/batchstrategy.so ${VERSION_DIR}/batchstrategy.so" + "mv ${VERSION_DIR}/batchstrategy.so models/${MODEL_NAME}/libtriton_volumebatching.so && echo \"parameters: {key: \\\"TRITON_BATCH_STRATEGY_PATH\\\", value: {string_value: \\\"${MODEL_DIR}/libtriton_volumebatching.so\\\"}}\" >> ${CONFIG_PATH}") + +for i in "${!test_setups[@]}"; do + echo "Running ${test_types[$i]} test" + eval ${test_setups[$i]} + + SERVER_LOG=${SERVER_LOG_BASE}_${test_types[$i]} + CLIENT_LOG=${CLIENT_LOG_BASE}_${test_types[$i]} + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + if [ `grep -c "Loading custom batching strategy" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to load custom batching strategy.***" + RET=1 + else + set +e + python $BATCH_CUSTOM_TEST >$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** ${test_types[$i]} Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** ${test_types[$i]} Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + fi + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test ModelBatchInitialize failure +FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc" +OLD_STRING="\/\/ Batcher will point to an unsigned integer representing the maximum" +NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");" + +sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH} + +(cd backend/examples/batching_strategies/volume_batching && + cd build && + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_CORE_REPO_TAG=$TRITON_CORE_REPO_TAG .. && + make -j4 install) + +cp -r backend/examples/batching_strategies/volume_batching/build/libtriton_volumebatching.so models/${MODEL_NAME}/libtriton_volumebatching.so + +SERVER_LOG=${SERVER_LOG_BASE}_batching_init_failure + +run_server +if [ "$SERVER_PID" != "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** ModelBatchInit Error Test: unexpected successful server start $SERVER\n***" + kill_server + RET=1 +else + if [ `grep -c "Failure test case" $SERVER_LOG` -lt 1 ] || [ `grep -c "Not found" $SERVER_LOG` -lt 1 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** ModelBatchInit Error Test: failed to find \"Failure test case\" message and/or \"Not found\" error type" + RET=1 + fi +fi + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_batch_input/batch_input_test.py b/qa/L0_batch_input/batch_input_test.py new file mode 100755 index 0000000000..02de27d921 --- /dev/null +++ b/qa/L0_batch_input/batch_input_test.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import queue +import unittest +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class BatchInputTest(tu.TestResultCollector): + def setUp(self): + self.client = grpcclient.InferenceServerClient(url="localhost:8001") + + def callback(user_data, result, error): + if error: + user_data.put(error) + else: + user_data.put(result) + + self.client_callback = callback + + def set_inputs(self, shapes, input_name): + self.dtype_ = np.float32 + self.inputs = [] + for shape in shapes: + self.inputs.append( + [grpcclient.InferInput(input_name, [1, shape[0]], "FP32")] + ) + self.inputs[-1][0].set_data_from_numpy( + np.full([1, shape[0]], shape[0], np.float32) + ) + + def set_inputs_for_batch_item(self, shapes, input_name): + self.dtype_ = np.float32 + self.inputs = [] + for shape in shapes: + self.inputs.append([grpcclient.InferInput(input_name, shape, "FP32")]) + self.inputs[-1][0].set_data_from_numpy(np.full(shape, shape[0], np.float32)) + + def test_ragged_output(self): + model_name = "ragged_io" + # The model is an identity model + self.set_inputs([[2], [4], [1], [3]], "INPUT0") + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "OUTPUT0" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. + async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + expected_value_list = [[v] * v for v in [2, 4, 1, 3]] + expected_value_list = [ + np.asarray([expected_value], dtype=np.float32) + for expected_value in expected_value_list + ] + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected_value_list[idx]), + "Expect response {} to have value {}, got {}".format( + idx, expected_value_list[idx], output_data + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_ragged_input(self): + model_name = "ragged_acc_shape" + self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "RAGGED_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. 
+ async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + value_lists = [[v] * v for v in [2, 4, 1, 3]] + expected_value = [] + for value_list in value_lists: + expected_value += value_list + expected_value = np.asarray([expected_value], dtype=np.float32) + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected_value), + "Expect response {} to have value {}, got {}".format( + idx, expected_value, output_data + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_element_count(self): + model_name = "ragged_element_count_acc_zero" + self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "BATCH_AND_SIZE_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. + async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + expected_value = np.asarray([[2, 4, 1, 3]], np.float32) + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected_value), + "Expect response {} to have value {}, got {}".format( + idx, expected_value, output_data + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_accumulated_element_count(self): + model_name = "ragged_acc_shape" + self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "BATCH_AND_SIZE_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. + async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + expected_value = np.asarray([[2, 6, 7, 10]], np.float32) + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. 
+ output_data = result.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected_value), + "Expect response {} to have value {}, got {}".format( + idx, expected_value, output_data + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_accumulated_element_count_with_zero(self): + model_name = "ragged_element_count_acc_zero" + self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "BATCH_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. + async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + expected_value = np.asarray([[0, 2, 6, 7, 10]], np.float32) + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected_value), + "Expect response {} to have value {}, got {}".format( + idx, expected_value, output_data + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_max_element_count_as_shape(self): + model_name = "ragged_acc_shape" + self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "BATCH_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. + async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertEqual( + output_data.shape, + (1, 4), + "Expect response {} to have shape to represent max element count {} among the batch , got {}".format( + idx, 4, output_data.shape + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_batch_item_shape_flatten(self): + # Use 4 set of inputs with shape + # [1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2] + # Note that the test only checks the formation of "BATCH_INPUT" where + # the value of "RAGGED_INPUT" is irrelevant, only the shape matters + self.set_inputs_for_batch_item( + [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT" + ) + + model_name = "batch_item_flatten" + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "BATCH_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. 
+ async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + expected_value = np.asarray([[4, 1, 1, 2, 1, 2, 2, 2]], np.float32) + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected_value), + "Expect response {} to have value {}, got {}".format( + idx, expected_value, output_data + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + def test_batch_item_shape(self): + # Use 3 set of inputs with shape [2, 1, 2], [1, 1, 2], [1, 2, 2] + # Note that the test only checks the formation of "BATCH_INPUT" where + # the value of "RAGGED_INPUT" is irrelevant, only the shape matters + self.set_inputs_for_batch_item( + [[2, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT" + ) + + expected_outputs = [ + np.array([[1.0, 2.0], [1.0, 2.0]]), + np.array([[1.0, 2.0]]), + np.array([[2.0, 2.0]]), + ] + + model_name = "batch_item" + user_data = queue.Queue() + self.client.start_stream(callback=partial(self.client_callback, user_data)) + + output_name = "BATCH_OUTPUT" + outputs = [grpcclient.InferRequestedOutput(output_name)] + + async_requests = [] + try: + for input in self.inputs: + # Asynchronous inference call. + async_requests.append( + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) + + for idx in range(len(async_requests)): + # Get the result from the initiated asynchronous inference request. + # Note the call will block till the server responds. + result = user_data.get() + + # Validate the results by comparing with precomputed values. + output_data = result.as_numpy(output_name) + self.assertTrue( + np.allclose(output_data, expected_outputs[idx]), + "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}".format( + expected_outputs[idx], + output_data, + np.isclose(expected_outputs[idx], output_data), + ), + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.client.stop_stream() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_batch_input/test.sh b/qa/L0_batch_input/test.sh new file mode 100755 index 0000000000..e780516ec4 --- /dev/null +++ b/qa/L0_batch_input/test.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +BATCH_INPUT_TEST=batch_input_test.py +EXPECTED_NUM_TESTS="8" + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_ragged_model_repository +IDENTITY_DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository + +TEST_RESULT_FILE='test_results.txt' +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --exit-timeout-secs=120" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="onnx savedmodel plan libtorch"} + +rm -f $SERVER_LOG $CLIENT_LOG + +RET=0 +for BACKEND in $BACKENDS; do + rm -rf models && mkdir models + cp -r $DATADIR/${BACKEND}_batch_input models/ragged_element_count_acc_zero + (cd models/ragged_element_count_acc_zero && \ + sed -i "s/${BACKEND}_batch_input/ragged_element_count_acc_zero/" config.pbtxt) + cp -r $DATADIR/${BACKEND}_batch_input models/ragged_acc_shape + (cd models/ragged_acc_shape && \ + sed -i "s/${BACKEND}_batch_input/ragged_acc_shape/" config.pbtxt && \ + sed -i "s/BATCH_ELEMENT_COUNT/BATCH_ACCUMULATED_ELEMENT_COUNT/" config.pbtxt && \ + sed -i "s/BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO/BATCH_MAX_ELEMENT_COUNT_AS_SHAPE/" config.pbtxt) + cp -r $DATADIR/${BACKEND}_batch_input models/batch_item_flatten + (cd models/batch_item_flatten && \ + sed -i "s/${BACKEND}_batch_input/batch_item_flatten/" config.pbtxt && \ + sed -i "0,/-1/{s/-1/-1, -1/}" config.pbtxt && \ + sed -i "s/BATCH_ELEMENT_COUNT/BATCH_ACCUMULATED_ELEMENT_COUNT/" config.pbtxt && \ + sed -i "s/BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO/BATCH_ITEM_SHAPE_FLATTEN/" config.pbtxt) + cp -r $DATADIR/${BACKEND}_batch_item models/batch_item + (cd models/batch_item && \ + sed -i "s/${BACKEND}_batch_item/batch_item/" config.pbtxt) + # Use nobatch model to showcase ragged input, identity model to verify + # batch input is generated properly + cp -r $IDENTITY_DATADIR/${BACKEND}_nobatch_zero_1_float32 models/ragged_io + (cd models/ragged_io && \ + # In case of libtorch, update I/O names + sed -i "s/__0/0/" config.pbtxt && \ + sed -i "s/${BACKEND}_nobatch_zero_1_float32/ragged_io/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 4/" config.pbtxt && \ + sed -i "s/name: \"INPUT0\"/name: \"INPUT0\"\\nallow_ragged_batch: true/" config.pbtxt && \ + echo "batch_output [{target_name: \"OUTPUT0\" \ + kind: BATCH_SCATTER_WITH_INPUT_SHAPE \ + source_input: 
\"INPUT0\" }] \ + dynamic_batching { max_queue_delay_microseconds: 1000000 }" >> config.pbtxt) + + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + python3 $BATCH_INPUT_TEST >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_batcher/batcher_test.py b/qa/L0_batcher/batcher_test.py old mode 100644 new mode 100755 index a2328d7443..38e208c21e --- a/qa/L0_batcher/batcher_test.py +++ b/qa/L0_batcher/batcher_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,179 +27,487 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + sys.path.append("../common") -from builtins import range -from future.utils import iteritems import os -import time import threading -import traceback +import time import unittest -import numpy as np +from builtins import range + import infer_util as iu +import numpy as np import test_util as tu -from tensorrtserver.api import * -import tensorrtserver.api.server_status_pb2 as server_status +import tritonclient.grpc as grpcclient + +# By default, find tritonserver on "localhost", but can be overridden +# with TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) + +if TEST_SYSTEM_SHARED_MEMORY: + import tritonclient.utils.shared_memory as shm +if TEST_CUDA_SHARED_MEMORY: + import tritonclient.utils.cuda_shared_memory as cudashm + +# Test with either GRPC of HTTP, but not both since when we check +# results we expect only one to run +USE_GRPC = os.environ.get("USE_GRPC", 1) != "0" +USE_HTTP = os.environ.get("USE_HTTP", 1) != "0" +if USE_GRPC and USE_HTTP: + USE_GRPC = False +assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero" + +BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx libtorch plan python") -_trials = ("graphdef", "plan", "netdef") -_max_queue_delay = 10000 -_check_exception = None +_trials = BACKENDS.split(" ") -class BatcherTest(unittest.TestCase): +_ragged_batch_supported_trials = ["custom"] +if "plan" in _trials: + _ragged_batch_supported_trials.append("plan") +if "onnx" in _trials: + _ragged_batch_supported_trials.append("onnx") +if "libtorch" in _trials: + _ragged_batch_supported_trials.append("libtorch") + +_max_queue_delay_ms = 10000 + +_deferred_exceptions_lock = threading.Lock() +_deferred_exceptions = [] + + +class BatcherTest(tu.TestResultCollector): def setUp(self): - global _check_exception - _check_exception = None + # The helper client for setup will be GRPC for simplicity. 
+ self.triton_client_ = grpcclient.InferenceServerClient( + f"{_tritonserver_ipaddr}:8001" + ) + self.precreated_shm_regions_ = [] + global _deferred_exceptions + _deferred_exceptions = [] + + def tearDown(self): + if TEST_SYSTEM_SHARED_MEMORY: + self.triton_client_.unregister_system_shared_memory() + if TEST_CUDA_SHARED_MEMORY: + self.triton_client_.unregister_cuda_shared_memory() + for precreated_shm_region in self.precreated_shm_regions_: + if TEST_SYSTEM_SHARED_MEMORY: + shm.destroy_shared_memory_region(precreated_shm_region) + elif TEST_CUDA_SHARED_MEMORY: + cudashm.destroy_shared_memory_region(precreated_shm_region) + super().tearDown() + + # FIXME why only used for outputs + def create_advance(self, shm_regions=None): + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + precreated_shm_regions = [] + if shm_regions is None: + shm_regions = ["output0", "output1"] + for shm_region in shm_regions: + if TEST_SYSTEM_SHARED_MEMORY: + shm_handle = shm.create_shared_memory_region( + shm_region + "_data", "/" + shm_region, 512 + ) + self.triton_client_.register_system_shared_memory( + shm_region + "_data", "/" + shm_region, 512 + ) + else: + shm_handle = cudashm.create_shared_memory_region( + shm_region + "_data", 512, 0 + ) + self.triton_client_.register_cuda_shared_memory( + shm_region + "_data", cudashm.get_raw_handle(shm_handle), 0, 512 + ) + # Collect precreated handles for cleanup + self.precreated_shm_regions_.append(shm_handle) + precreated_shm_regions.append(shm_handle) + return precreated_shm_regions + return [] + + def add_deferred_exception(self, ex): + global _deferred_exceptions + with _deferred_exceptions_lock: + _deferred_exceptions.append(ex) def check_deferred_exception(self): - if _check_exception is not None: - raise _check_exception + # Just raise one of the exceptions... 
+ with _deferred_exceptions_lock: + if len(_deferred_exceptions) > 0: + raise _deferred_exceptions[0] - def check_response(self, trial, bs, less_than, threshold_ms, - requested_outputs=("OUTPUT0", "OUTPUT1")): - global _check_exception + def check_response( + self, + trial, + bs, + thresholds, + requested_outputs=("OUTPUT0", "OUTPUT1"), + input_size=16, + shm_region_names=None, + precreated_shm_regions=None, + ): try: - input_size = 16 - start_ms = int(round(time.time() * 1000)) - if trial == "graphdef" or trial == "netdef": - tensor_shape = (input_size,) - iu.infer_exact(self, trial, tensor_shape, bs, True, - np.float32, np.float32, np.float32, swap=True, - outputs=requested_outputs, - use_grpc=False, skip_request_id_check=True) - elif trial == "plan": - tensor_shape = (input_size,1,1) - iu.infer_exact(self, trial, tensor_shape, bs, True, - np.float32, np.float32, np.float32, swap=True, - outputs=requested_outputs, - use_grpc=False, skip_request_id_check=True) + if ( + trial == "savedmodel" + or trial == "graphdef" + or trial == "libtorch" + or trial == "onnx" + or trial == "plan" + or trial == "python" + ): + tensor_shape = (bs, input_size) + iu.infer_exact( + self, + trial, + tensor_shape, + bs, + np.float32, + np.float32, + np.float32, + swap=False, + model_version=1, + outputs=requested_outputs, + use_http_json_tensors=False, + use_grpc=USE_GRPC, + use_http=USE_HTTP, + skip_request_id_check=True, + use_streaming=False, + shm_region_names=shm_region_names, + precreated_shm_regions=precreated_shm_regions, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) else: self.assertFalse(True, "unknown trial type: " + trial) end_ms = int(round(time.time() * 1000)) - if less_than: - self.assertTrue((end_ms - start_ms) < threshold_ms, - "expected less than " + str(threshold_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") - else: - self.assertTrue((end_ms - start_ms) > threshold_ms, - "expected greater than " + str(threshold_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + + lt_ms = thresholds[0] + gt_ms = thresholds[1] + if lt_ms is not None: + self.assertTrue( + (end_ms - start_ms) < lt_ms, + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + if gt_ms is not None: + self.assertTrue( + (end_ms - start_ms) > gt_ms, + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: - _check_exception = ex + self.add_deferred_exception(ex) - def check_setup(self, url, protocol, model_name): + def check_setup(self, model_name, preferred_batch_sizes, max_queue_delay_us): # Make sure test.sh set up the correct batcher settings - ctx = ServerStatusContext(url, protocol, model_name, True) - ss = ctx.get_server_status() - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(model_name in ss.model_status, - "expected status for model " + model_name) - bconfig = ss.model_status[model_name].config.dynamic_batching - self.assertTrue(2 in bconfig.preferred_batch_size) - self.assertTrue(6 in bconfig.preferred_batch_size) - self.assertEqual(bconfig.max_queue_delay_microseconds, 10000000) # 10 secs - - def check_status(self, url, protocol, model_name, static_bs, exec_cnt, infer_cnt): - ctx = ServerStatusContext(url, protocol, model_name, True) - ss = ctx.get_server_status() - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(model_name in ss.model_status, - "expected status for 
model " + model_name) - vs = ss.model_status[model_name].version_status - self.assertEqual(len(vs), 2) # *_float32_float32_float32 has 2 versions - self.assertTrue(3 in vs, "expected status for version 3") - infer = vs[3].infer_stats - self.assertEqual(len(infer), len(static_bs), - "expected batch-sizes (" + ",".join(str(b) for b in static_bs) + - "), got " + str(vs[3])) - for b in static_bs: - self.assertTrue(b in infer, - "expected batch-size " + str(b) + ", got " + str(vs[3])) - self.assertEqual(vs[3].model_execution_count, exec_cnt, - "expected model-execution-count " + str(exec_cnt) + ", got " + - str(vs[3].model_execution_count)) - self.assertEqual(vs[3].model_inference_count, infer_cnt, - "expected model-inference-count " + str(infer_cnt) + ", got " + - str(vs[3].model_inference_count)) + config = self.triton_client_.get_model_config(model_name).config + bconfig = config.dynamic_batching + self.assertEqual(len(bconfig.preferred_batch_size), len(preferred_batch_sizes)) + for i in preferred_batch_sizes: + self.assertTrue(i in bconfig.preferred_batch_size) + self.assertEqual(bconfig.max_queue_delay_microseconds, max_queue_delay_us) + + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, exec_count): + # There is a time window between when responses are returned and statistics are updated. + # To prevent intermittent test failure during that window, wait up to 10 seconds for the + # inference statistics to be ready. + num_tries = 10 + for i in range(num_tries): + stats = self.triton_client_.get_inference_statistics(model_name, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + actual_exec_cnt = stats.model_stats[0].execution_count + if actual_exec_cnt in exec_count: + break + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_count, actual_exec_cnt, i + ) + ) + time.sleep(1) + + self.assertEqual( + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) + + if batch_exec: + batch_stats = stats.model_stats[0].batch_stats + self.assertEqual( + len(batch_stats), + len(batch_exec), + "expected {} different batch-sizes, got {}".format( + len(batch_exec), len(batch_stats) + ), + ) + + for batch_stat in batch_stats: + bs = batch_stat.batch_size + bc = batch_stat.compute_infer.count + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) + # Get count from one of the stats + self.assertEqual( + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) + + actual_request_cnt = stats.model_stats[0].inference_stats.success.count + self.assertEqual( + actual_request_cnt, + request_cnt, + "expected model-request-count {}, got {}".format( + request_cnt, actual_request_cnt + ), + ) + + actual_exec_cnt = stats.model_stats[0].execution_count + self.assertIn( + actual_exec_cnt, + exec_count, + "expected model-exec-count {}, got {}".format(exec_count, actual_exec_cnt), + ) + + actual_infer_cnt = stats.model_stats[0].inference_count + self.assertEqual( + actual_infer_cnt, + infer_cnt, + "expected model-inference-count {}, got {}".format( + infer_cnt, actual_infer_cnt + ), + ) def test_static_batch_preferred(self): # Send two requests with static batch sizes == preferred # size. 
This should cause the responses to be returned # immediately + precreated_shm_regions = self.create_advance() for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.check_response(trial, 2, True, 3000) - self.check_response(trial, 6, True, 3000) + self.check_response( + trial, + 2, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) + self.check_response( + trial, + 6, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() - self.check_status(url, protocol, model_name, (2,6), 2, 8) - except InferenceServerException as ex: + self.check_status(model_name, {2: 1, 6: 1}, 2, 8, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_static_batch_lt_any_preferred(self): # Send a request with a static batch size < any preferred # size. This should cause the response to be delayed by the # max batch queue delay + precreated_shm_regions = self.create_advance() for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.check_response(trial, 1, False, _max_queue_delay) + self.check_response( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,), 1, 1) - except InferenceServerException as ex: + self.check_status(model_name, {1: 1}, 1, 1, (1,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_static_batch_not_preferred(self): # Send a request with a static batch size in between preferred # sizes. 
This should cause the response to be delayed by the # max batch queue delay + precreated_shm_regions = self.create_advance() for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.check_response(trial, 3, False, _max_queue_delay) + self.check_response( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() - self.check_status(url, protocol, model_name, (3,), 1, 3) - except InferenceServerException as ex: + self.check_status(model_name, {3: 1}, 1, 3, (1,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_static_batch_gt_max_preferred(self): # Send a request with a static batch size > maximum preferred # size. This should cause the request to be issued immediately # (even though the maximum batching queue delay is very high). + precreated_shm_regions = self.create_advance() for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.check_response(trial, 7, True, 3000) + self.check_response( + trial, + 7, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() - self.check_status(url, protocol, model_name, (7,), 1, 7) - except InferenceServerException as ex: + self.check_status(model_name, {7: 1}, 1, 7, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_different_shape_allow_ragged(self): + # Send two requests with static batch sizes == preferred size, + # but with different shapes (using model with variable-size + # tensors). Input tensors are marked as allowing ragged batch + # so requests should be batched. 
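# "Marked as allowing ragged batch" refers to the model configuration rather
# than anything in the client calls below: the variable-size input carries
# allow_ragged_batch, which lets the dynamic batcher combine requests whose
# shapes differ. The actual config.pbtxt files for these QA models are not
# part of this file, so the field values below (input name, dims) are an
# assumed sketch of that setting, not the real configuration.
ALLOW_RAGGED_INPUT_SKETCH = """
input [
  {
    name: "INPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
    allow_ragged_batch: true
  }
]
"""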
+ for trial in _ragged_batch_supported_trials: + try: + dtype = np.float32 + model_name = tu.get_zero_model_name(trial, 1, dtype) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + + threads = [] + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, trial, 1, dtype, ([1, 16],), ([1, 16],)), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, trial, 1, dtype, ([1, 8],), ([1, 8],)), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) + threads[0].start() + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {2: 1}, 2, 2, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_different_shape(self): + # Send two requests with sum of static batch sizes == + # preferred size, but with different shapes (using model with + # variable-size tensors). This should cause the requests to + # not be batched. The first response will come back + # immediately and the second delayed by the max batch queue + # delay + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "input_size": 16, + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "input_size": 8, + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {1: 2}, 2, 2, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_batch_not_preferred(self): @@ -206,147 +516,442 @@ def test_multi_batch_not_preferred(self): # delayed by the max batch queue delay, and the second by max # delay (minus the difference in time that they arrived in the # queue) + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + 
model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, False, _max_queue_delay))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 3, False, _max_queue_delay - 2000))) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms - 2000), + ), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,3), 1, 4) - except InferenceServerException as ex: + self.check_status(model_name, {4: 1}, 2, 4, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_not_preferred_different_shape(self): + # Send two requests with total static batch size in between + # preferred sizes. Then send a request with a different shape + # and a non-preferred batch size. This should cause the first + # two requests to be immediately responded to and the third + # response to be delayed by the max batch queue delay. 
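# Throughout these tests the `thresholds` argument to check_response() is a
# (lt_ms, gt_ms) pair: the response must come back in under lt_ms, and, when
# gt_ms is not None, must also take longer than gt_ms (i.e. it was held in the
# batcher queue). For example, (6000, None) means "returned promptly", while
# (_max_queue_delay_ms * 1.5, _max_queue_delay_ms) means "delayed by roughly
# the max queue delay". A standalone sketch of that check, separate from the
# Triton client plumbing in check_response() (helper name is illustrative):
def assert_latency_window(start_s, end_s, lt_ms=None, gt_ms=None):
    elapsed_ms = (end_s - start_s) * 1000.0
    if lt_ms is not None:
        assert elapsed_ms < lt_ms, f"expected < {lt_ms} ms, got {elapsed_ms:.0f} ms"
    if gt_ms is not None:
        assert elapsed_ms > gt_ms, f"expected > {gt_ms} ms, got {elapsed_ms:.0f} ms"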
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 3, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads[0].start() + threads[1].start() + time.sleep(1) + threads[2].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {1: 1, 4: 1}, 3, 5, (2,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_preferred_different_shape(self): + # Send two requests with total static batch size in between + # preferred sizes. Then send a request with a different shape + # and a non-preferred batch size. This should cause the first + # two requests to be immediately responded to. Send a forth + # request with the same shape as the third that causes a + # preferred size so that third and forth response are sent + # immediately. 
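# The tests in this file verify the same scheduler setup through
# check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000), i.e. preferred
# batch sizes of 2 and 6 and a 10-second maximum queue delay. A sketch of the
# dynamic_batching block that this corresponds to; the real config.pbtxt files
# are prepared by the accompanying test.sh and are not shown here, so treat
# the exact text as an assumption:
DYNAMIC_BATCHING_SKETCH = """
dynamic_batching {
  preferred_batch_size: [ 2, 6 ]
  max_queue_delay_microseconds: 10000000
}
"""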
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + shm3_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 3, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 5, (6000, None)), + kwargs={ + "input_size": 8, + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) + threads[0].start() + threads[1].start() + time.sleep(1) + threads[2].start() + threads[3].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 1, 6: 1}, 4, 10, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_batch_gt_max_preferred(self): # Send two requests with first not having preferred size and - # second being larger than max preferred size. This should cause - # both responses to be returned immediately. + # second being larger than max preferred size. Delay the + # second request so that it arrives after the first is already + # be processed by the dynamic batcher. This should cause both + # responses to be returned immediately. 
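# check_status() takes a {batch_size: execution_count} map plus the expected
# request count, inference count and allowed execution counts. The aggregate
# numbers follow directly from the map; a small illustrative helper (not part
# of the test utilities) makes the relationship explicit:
def expected_counts(batch_exec):
    infer_cnt = sum(bs * cnt for bs, cnt in batch_exec.items())
    exec_cnt = sum(batch_exec.values())
    return infer_cnt, exec_cnt

# For this test, expected_counts({3: 1, 7: 1}) == (10, 2), matching the
# check_status(model_name, {3: 1, 7: 1}, 2, 10, (2,)) call below.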
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 3, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 7, True, 3000))) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 3, (3000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 7, (3000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (3, 7), 2, 10) - except InferenceServerException as ex: + self.check_status(model_name, {3: 1, 7: 1}, 2, 10, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_batch_sum_gt_max_preferred(self): # Send two requests with first not having preferred size and # second being smaller than max preferred size but the sum of - # the requests being larger than max preferred size. This - # should cause first response to be returned immediately but - # the second response, since it alone is not greater than max - # preferred size, will be delayed. + # the requests being larger than max preferred size. Delay the + # second request so that it arrives after the first is already + # be processed by the dynamic batcher. This should cause first + # response to be returned immediately but the second response, + # since it alone is not greater than max preferred size, will + # be delayed. 
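# The scheduling decision exercised here, stated as arithmetic: with preferred
# sizes (2, 6), merging the queued size-3 request with the incoming size-4
# request would give a batch of 7, which is larger than the largest preferred
# size, so the size-3 request is executed on its own; the size-4 request is
# not itself a preferred size and therefore waits out the full queue delay.
# This is a deliberately simplified model of the dynamic batcher, shown only
# to make the expected timings in this test concrete:
def merge_exceeds_max_preferred(queued_bs, incoming_bs, preferred=(2, 6)):
    return queued_bs + incoming_bs > max(preferred)

assert merge_exceeds_max_preferred(3, 4)  # 3 + 4 = 7 > 6: run the size-3 batch alone
assert 4 not in (2, 6)                    # 4 is not a preferred size: it gets delayed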
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 3, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 4, False, _max_queue_delay))) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 3, (3000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (3,4), 2, 7) - except InferenceServerException as ex: + self.check_status(model_name, {3: 1, 4: 1}, 2, 7, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_same_output0(self): # Send two requests where both ask for OUTPUT0. They should be # batched and get the correct response even though they don't # request both outputs. 
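# A standalone sketch of what "ask only for OUTPUT0" looks like at the client
# level, using the same tritonclient.grpc API as the rest of this file. The
# real requests go through infer_util.infer_exact() with
# requested_outputs=("OUTPUT0",); the URL, input names and 16-element shape
# below are illustrative assumptions.
import numpy as np
import tritonclient.grpc as grpcclient

def infer_output0_only(model_name, url="localhost:8001"):
    client = grpcclient.InferenceServerClient(url=url)
    inputs = [
        grpcclient.InferInput("INPUT0", [1, 16], "FP32"),
        grpcclient.InferInput("INPUT1", [1, 16], "FP32"),
    ]
    for inp in inputs:
        inp.set_data_from_numpy(np.zeros([1, 16], dtype=np.float32))
    # Only OUTPUT0 is requested; the dynamic batcher can still batch this
    # request with one that asks only for OUTPUT1.
    outputs = [grpcclient.InferRequestedOutput("OUTPUT0")]
    return client.infer(model_name, inputs, outputs=outputs).as_numpy("OUTPUT0")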
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00"] + shm1_region_names = ["ip10", "ip11", "op10"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00"]) + precreated_shm1_regions = self.create_advance(["op10"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000), - kwargs={'requested_outputs': ("OUTPUT0",)})) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000), - kwargs={'requested_outputs': ("OUTPUT0",)})) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (3000, None)), + kwargs={ + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (3000, None)), + kwargs={ + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,), 1, 2) - except InferenceServerException as ex: + self.check_status(model_name, {2: 1}, 2, 2, (1,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_same_output1(self): # Send two requests where both ask for OUTPUT1. They should be # batched and get the correct response even though they don't # request both outputs. 
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op01"] + shm1_region_names = ["ip10", "ip11", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op01"]) + precreated_shm1_regions = self.create_advance(["op11"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000), - kwargs={'requested_outputs': ("OUTPUT1",)})) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000), - kwargs={'requested_outputs': ("OUTPUT1",)})) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (3000, None)), + kwargs={ + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (3000, None)), + kwargs={ + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,), 1, 2) - except InferenceServerException as ex: + self.check_status(model_name, {2: 1}, 2, 2, (1,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_different_outputs(self): @@ -354,70 +959,451 @@ def test_multi_different_outputs(self): # the other request asks for the other output. They should be # batched and get the correct response even though they don't # request both outputs. 
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00"] + shm1_region_names = ["ip10", "ip11", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00"]) + precreated_shm1_regions = self.create_advance(["op11"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads[0].start() + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {2: 1}, 2, 2, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_different_output_order(self): + # Send two requests that ask for both outputs, but in a + # different order. They should be batched and get the correct + # response even though they use different order. + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op11", "op10"] + else: + shm0_region_names = None + shm1_region_names = None + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + + self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT0", "OUTPUT1"), + "shm_region_names": shm0_region_names, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT1", "OUTPUT0"), + "shm_region_names": shm1_region_names, + }, + ) + ) + threads[0].start() + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {2: 1}, 2, 2, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_delayed_sum_gt_max_preferred(self): + # Send two requests with first not having preferred size and + # second being smaller than max preferred size but the sum of + # the requests being larger than max preferred size. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. This should cause first response to be returned + # immediately but the second response, since it alone is not + # greater than max preferred size, will be delayed. 
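# The "delayed" variants depend on the TRITONSERVER_DELAY_SCHEDULER
# environment variable, which (presumably via the accompanying test.sh, not
# shown here) is exported both to this test process and to tritonserver so
# that the dynamic batcher waits until the stated number of requests are
# queued before it starts scheduling. A rough sketch of that launch step,
# using the server path and flag that appear elsewhere in this change set:
import os
import subprocess

def start_server_with_delayed_scheduler(queued_requests, model_repo="models"):
    env = dict(os.environ, TRITONSERVER_DELAY_SCHEDULER=str(queued_requests))
    return subprocess.Popen(
        ["/opt/tritonserver/bin/tritonserver", f"--model-repository={model_repo}"],
        env=env,
    )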
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + else: + shm0_region_names = None + shm1_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + + # Need scheduler to wait for queue to contain 2 requests + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 3, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {3: 1, 4: 1}, 2, 7, (2,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_delayed_use_max_batch(self): + # Send three requests with first not having preferred size, + # second being smaller than max preferred size but the sum of + # the requests being larger than max preferred size and third + # is sent after the first two requests exceeds the queue delay + # and the sum of the requests to be in full batch. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. This should cause all response to be returned together, + # while it appears that the first two responses to be returned + # after being delayed and the third response to be returned immediately. 
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) + + # Need scheduler to wait for queue to contain 3 requests + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads[0].start() + threads[1].start() + time.sleep(11) + threads[2].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {8: 1}, 3, 8, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_multi_batch_delayed_preferred_different_shape(self): + # Send two requests with total static batch size in between + # preferred sizes. Then send a request with a different shape + # and a non-preferred batch size. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. This should cause the first two requests to be + # immediately responded to. Send a forth request with the same + # shape as the third that causes a preferred size so that + # third and forth response are sent immediately. 
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + shm3_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) - self.assertFalse("TRTSERVER_DELAY_SCHEDULER" in os.environ) + # Need scheduler to wait for queue to contain 4 requests + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000), - kwargs={'requested_outputs': ("OUTPUT0",)})) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000), - kwargs={'requested_outputs': ("OUTPUT1",)})) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (3000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 3, (3000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (3000, None)), + kwargs={ + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 5, (3000, None)), + kwargs={ + "input_size": 8, + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads[0].start() threads[1].start() + time.sleep(1) + threads[2].start() + threads[3].start() for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,), 1, 2) - except InferenceServerException as ex: + self.check_status(model_name, {4: 1, 6: 1}, 4, 10, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_batch_use_biggest_preferred(self): # Send multiple requests that sum to multiple preferred sizes - # and make sure the largest preferred size if used for the - # batch. Requires TRTSERVER_DELAY_SCHEDULER in the environment - # so that requests can be queued up before scheduler starts + # and make sure the largest preferred size is used for the + # batch. Use TRITONSERVER_DELAY_SCHEDULER in the environment so + # that requests can be queued up before scheduler starts # servicing. 
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + shm5_region_names = ["ip50", "ip51", "op50", "op51"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + shm3_region_names = None + shm4_region_names = None + shm5_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + precreated_shm5_regions = self.create_advance(["op50", "op51"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 6 request - self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 6) + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 6) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm5_region_names, + "precreated_shm_regions": precreated_shm5_regions, + }, + ) + ) for t in threads: t.start() for t 
in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,), 1, 6) - except InferenceServerException as ex: + self.check_status(model_name, {6: 1}, 6, 6, (1,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_multi_batch_use_best_preferred(self): @@ -425,28 +1411,67 @@ def test_multi_batch_use_best_preferred(self): # preferred size and then extra request goes beyond that. The # initial requests should be handled immediately at the # preferred batch size and then the other one after - # timeout. Requires TRTSERVER_DELAY_SCHEDULER in the - # environment so that requests can be queued up before - # scheduler starts servicing. + # timeout. Use TRITONSERVER_DELAY_SCHEDULER in the environment so + # that requests can be queued up before scheduler starts + # servicing. + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - url = "localhost:8000" - protocol = ProtocolType.HTTP - model_name = tu.get_model_name(trial, np.float32, np.float32, np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) - self.check_setup(url, protocol, model_name) + self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 3 requests - self.assertTrue("TRTSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual(int(os.environ["TRTSERVER_DELAY_SCHEDULER"]), 3) + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, True, 3000))) - threads.append(threading.Thread(target=self.check_response, - args=(trial, 1, False, _max_queue_delay))) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -454,9 +1479,512 @@ def test_multi_batch_use_best_preferred(self): for t in threads: t.join() self.check_deferred_exception() - self.check_status(url, protocol, model_name, (1,), 2, 3) - except InferenceServerException as ex: + self.check_status(model_name, {2: 1, 1: 1}, 3, 3, (2,)) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': + def 
test_multi_batch_preserve_ordering(self): + model_base = "custom" + dtype = np.float32 + shapes = ( + [ + 1, + 1, + ], + ) + + try: + # use threads to send 12 requests without waiting for response + threads = [] + for i in range(12): + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] + else: + shm_region_name_prefix = None + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + model_name = tu.get_zero_model_name(model_base, len(shapes), dtype) + self.check_status(model_name, {4: 3}, 12, 12, (3,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_preferred_batch_only_aligned(self): + # Send 4 requests with batch size 1. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. The batcher should form a batch of preferred + # size 4. + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + shm3_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [4, 6], 0) + + # Need scheduler to wait for queue to contain 4 requests + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 1}, 4, 4, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def 
test_preferred_batch_only_unaligned(self): + # Send 5 requests with batch size 1. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. The batcher should form a batch of preferred + # size 4 followed by a batch of size 1. + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + shm3_region_names = None + shm4_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [4, 6], 0) + + # Need scheduler to wait for queue to contain 3 requests + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 5) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 1, 1: 1}, 5, 5, (2,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_preferred_batch_only_use_biggest_preferred(self): + # Send 7 requests with batch size 1. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. The batcher should form a batch of largest preferred + # size 6 followed by a batch of size 1. 
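+        # The preferred_batch_only_models repository built by test.sh appends
+        #   dynamic_batching { preferred_batch_size: [ 4, 6 ] }
+        # to each model config, leaving max_queue_delay_microseconds at its
+        # default of 0, which matches the arguments passed to check_setup()
+        # below.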
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + shm5_region_names = ["ip50", "ip51", "op50", "op51"] + shm6_region_names = ["ip60", "ip61", "op60", "op61"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + shm3_region_names = None + shm4_region_names = None + shm5_region_names = None + shm6_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + precreated_shm5_regions = self.create_advance(["op50", "op51"]) + precreated_shm6_regions = self.create_advance(["op60", "op61"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [4, 6], 0) + + # Need scheduler to wait for queue to contain 6 request + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 7) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm5_region_names, + "precreated_shm_regions": precreated_shm5_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm6_region_names, + "precreated_shm_regions": precreated_shm6_regions, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {6: 1, 1: 1}, 7, 7, (2,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_preferred_batch_only_use_no_preferred_size(self): + # Send 3 requests with batch size 1. Use + # TRITONSERVER_DELAY_SCHEDULER in the environment so that + # requests can be queued up before scheduler starts + # servicing. The batcher should form a batch of of 3. 
+ if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + else: + shm0_region_names = None + shm1_region_names = None + shm2_region_names = None + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + for trial in _trials: + try: + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) + + self.check_setup(model_name, [4, 6], 0) + + # Need scheduler to wait for queue to contain 3 request + self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {3: 1}, 3, 3, (1,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_max_queue_delay_only_non_default(self): + # Send 12 requests with batch size 1. The max_queue_delay is set + # to non-zero. Depending upon the timing of the requests arrival + # there can be either 1 or 2 model executions. + model_base = "custom" + dtype = np.float32 + shapes = ( + [ + 1, + 1, + ], + ) + + try: + # use threads to send 12 requests without waiting for response + threads = [] + for i in range(12): + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] + else: + shm_region_name_prefix = None + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + model_name = tu.get_zero_model_name(model_base, len(shapes), dtype) + self.check_status(model_name, None, 12, 12, (1, 2)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_max_queue_delay_only_default(self): + # Send 12 requests with batch size 1. The max_queue_delay is set + # to default value of 0. There should be two distinct model + # executions. The first few requests will form a first batch + # and the remaining requests will form the second batch. 
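+        # Both max_queue_delay_only tests run against the custom_zero_1_float32
+        # model that test.sh sets up with max_batch_size 100 and a 100 ms
+        # execute delay; test.sh adjusts max_queue_delay_microseconds (20000
+        # for the non-default test, 0 here) before starting the server.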
+ model_base = "custom" + dtype = np.float32 + shapes = ( + [ + 1, + 1, + ], + ) + + try: + # use threads to send 12 requests without waiting for response + threads = [] + for i in range(12): + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] + else: + shm_region_name_prefix = None + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + model_name = tu.get_zero_model_name(model_base, len(shapes), dtype) + self.check_status(model_name, None, 12, 12, (2,)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batcher/queue_timeout_test.py b/qa/L0_batcher/queue_timeout_test.py new file mode 100755 index 0000000000..886bf52a03 --- /dev/null +++ b/qa/L0_batcher/queue_timeout_test.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
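+
+# This test assumes the "dynamic_batch" model repository created by test.sh:
+# an identity-backend model with max_batch_size 1, an 8000 ms execute delay,
+# and a dynamic batcher configured as
+#
+#   dynamic_batching {
+#     preferred_batch_size: [ 1 ]
+#     default_queue_policy {
+#       timeout_action: REJECT
+#       default_timeout_microseconds: 1000000
+#       max_queue_size: 8
+#     }
+#   }
+#
+# so a request queued behind the saturating request should be rejected with
+# "Request timeout expired" after roughly one second.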
+ +import concurrent.futures +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class TestMaxQueueDelayTimeout(unittest.TestCase): + def setUp(self): + # Initialize client + self._triton = grpcclient.InferenceServerClient("localhost:8001") + + def _get_inputs(self, batch_size): + self.assertIsInstance(batch_size, int) + self.assertGreater(batch_size, 0) + shape = [batch_size, 8] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + inputs[0].set_data_from_numpy(np.ones(shape, dtype=np.float32)) + return inputs + + def _generate_callback_and_response_pair(self): + response = {"responded": False, "result": None, "error": None} + + def callback(result, error): + response["responded"] = True + response["result"] = result + response["error"] = error + + return callback, response + + # Test queued requests on dynamic batch scheduler can be cancelled + def test_default_queue_policy_timeout_prompt_response(self): + model_name = "dynamic_batch" + with concurrent.futures.ThreadPoolExecutor() as pool: + # Saturate the slots on the model + saturate_thread = pool.submit( + self._triton.infer, model_name, self._get_inputs(batch_size=1) + ) + time.sleep(2) # ensure the slots are filled + # The next request should be queued + callback, response = self._generate_callback_and_response_pair() + self._triton.async_infer( + model_name, self._get_inputs(batch_size=1), callback + ) + time.sleep(2) # ensure the request is queued + # Check if the request has timed-out + time.sleep(2) # ensure the timeout period has expired + self.assertTrue(response["responded"]) + self.assertEqual(response["result"], None) + self.assertIsInstance(response["error"], InferenceServerException) + self.assertEqual(response["error"].status(), "StatusCode.UNAVAILABLE") + self.assertEqual(response["error"].message(), "Request timeout expired") + # Join saturating thread + saturate_thread.result() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh index 7d72cc88f0..7043aab2a5 100755 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,45 +25,367 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. 
+export CUDA_VISIBLE_DEVICES=0 + CLIENT_LOG="./client.log" BATCHER_TEST=batcher_test.py +VERIFY_TIMESTAMPS=verify_timestamps.py +TEST_RESULT_FILE='test_results.txt' + +if [ -z "$TEST_VALGRIND" ]; then + TEST_VALGRIND="0" +fi + +if [ -z "$TEST_CUDA_SHARED_MEMORY" ]; then + TEST_CUDA_SHARED_MEMORY="0" +fi + +# Add valgrind flag check +if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK=/usr/bin/valgrind + LEAKCHECK_ARGS_BASE="--leak-check=full --show-leak-kinds=definite --max-threads=3000" + SERVER_TIMEOUT=3600 + rm -f *.valgrind.log + + NO_DELAY_TESTS="test_static_batch_preferred \ + test_multi_batch_sum_gt_max_preferred \ + test_multi_same_output0 \ + test_multi_different_output_order" + + DELAY_TESTS="test_multi_batch_use_biggest_preferred \ + test_multi_batch_use_best_preferred" -DATADIR=/data/inferenceserver + DIFFERENT_SHAPE_TESTS="test_multi_batch_not_preferred_different_shape \ + test_multi_batch_different_shape_allow_ragged" +fi + +TF_VERSION=${TF_VERSION:=2} + +# On windows the paths invoked by the script (running in WSL) must use +# /mnt/c when needed but the paths on the tritonserver command-line +# must be C:/ style. +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then + MODELDIR=${MODELDIR:=C:/models} + DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} + BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} + SERVER=${SERVER:=/mnt/c/tritonserver/bin/tritonserver.exe} + export WSLENV=$WSLENV:TRITONSERVER_DELAY_SCHEDULER +else + MODELDIR=${MODELDIR:=`pwd`} + DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} + SERVER=${TRITON_DIR}/bin/tritonserver + BACKEND_DIR=${TRITON_DIR}/backends + + # PyTorch on SBSA requires libgomp to be loaded first. See the following + # GitHub issue for more information: + # https://github.com/pytorch/pytorch/issues/2575 + arch=`uname -m` + if [ $arch = "aarch64" ]; then + SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1 + fi +fi -SERVER=/opt/tensorrtserver/bin/trtserver +SERVER_ARGS_EXTRA="--backend-directory=${BACKEND_DIR} --backend-config=tensorflow,version=${TF_VERSION}" source ../common/util.sh RET=0 -# Setup model store -rm -fr *.log *.serverlog models && mkdir models -for m in \ - graphdef_float32_float32_float32 \ - netdef_float32_float32_float32 \ - plan_float32_float32_float32 ; do - cp -r $DATADIR/qa_model_repository/$m models/. 
&& - (cd models/$m && \ - sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ - echo "dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch plan python"} +export BACKENDS + +# Basic batcher tests +NO_DELAY_TESTS=${NO_DELAY_TESTS:="test_static_batch_preferred \ + test_static_batch_lt_any_preferred \ + test_static_batch_not_preferred \ + test_static_batch_gt_max_preferred \ + test_multi_batch_not_preferred \ + test_multi_batch_gt_max_preferred \ + test_multi_batch_sum_gt_max_preferred \ + test_multi_same_output0 \ + test_multi_same_output1 \ + test_multi_different_outputs \ + test_multi_different_output_order"} + +# Tests that use scheduler delay +DELAY_TESTS=${DELAY_TESTS:="test_multi_batch_delayed_sum_gt_max_preferred \ + test_multi_batch_use_biggest_preferred \ + test_multi_batch_use_best_preferred \ + test_multi_batch_delayed_use_max_batch"} + +# Tests with different shapes +DIFFERENT_SHAPE_TESTS=${DIFFERENT_SHAPE_TESTS:="test_multi_batch_not_preferred_different_shape \ + test_multi_batch_preferred_different_shape \ + test_multi_batch_different_shape_allow_ragged \ + test_multi_batch_different_shape"} + +# Test with preferred batch sizes but default max_queue_delay +PREFERRED_BATCH_ONLY_TESTS=${PREFERRED_BATCH_ONLY_TESTS:="test_preferred_batch_only_aligned \ + test_preferred_batch_only_unaligned \ + test_preferred_batch_only_use_biggest_preferred \ + test_preferred_batch_only_use_no_preferred_size"} + +# Tests with varying delay for max queue but no preferred batch size +MAX_QUEUE_DELAY_ONLY_TESTS=${MAX_QUEUE_DELAY_ONLY_TESTS:="test_max_queue_delay_only_default \ + test_max_queue_delay_only_non_default"} + +# Setup non-variable-size model repository +rm -fr *.log models && mkdir models +for BACKEND in $BACKENDS; do + TMP_MODEL_DIR="$DATADIR/qa_model_repository/${BACKEND}_float32_float32_float32" + if [ "$BACKEND" == "python" ]; then + # We will be using ONNX models config.pbtxt and tweak them to make them + # appropriate for Python backend + onnx_model="${DATADIR}/qa_model_repository/onnx_float32_float32_float32" + python_model=`echo $onnx_model | sed 's/onnx/python/g' | sed 's,'"$DATADIR/qa_model_repository/"',,g'` + mkdir -p models/$python_model/1/ + cat $onnx_model/config.pbtxt | sed 's/platform:.*/backend:\ "python"/g' | sed 's/onnx/python/g' > models/$python_model/config.pbtxt + cp $onnx_model/output0_labels.txt models/$python_model + cp ../python_models/add_sub/model.py models/$python_model/1/ + else + cp -r $TMP_MODEL_DIR models/. 
+ fi + (cd models/$(basename $TMP_MODEL_DIR) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific { versions: [1] }}/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) +done + +rm -fr preferred_batch_only_models && mkdir preferred_batch_only_models +for BACKEND in $BACKENDS; do + TMP_MODEL_DIR="$DATADIR/qa_model_repository/${BACKEND}_float32_float32_float32" + if [ "$BACKEND" == "python" ]; then + # We will be using ONNX models config.pbtxt and tweak them to make them + # appropriate for Python backend + onnx_model="${DATADIR}/qa_model_repository/onnx_float32_float32_float32" + python_model=`echo $onnx_model | sed 's/onnx/python/g' | sed 's,'"$DATADIR/qa_model_repository/"',,g'` + mkdir -p preferred_batch_only_models/$python_model/1/ + cat $onnx_model/config.pbtxt | sed 's/platform:.*/backend:\ "python"/g' | sed 's/onnx/python/g' > preferred_batch_only_models/$python_model/config.pbtxt + cp $onnx_model/output0_labels.txt preferred_batch_only_models/$python_model + cp ../python_models/add_sub/model.py preferred_batch_only_models/$python_model/1/ + else + cp -r $TMP_MODEL_DIR preferred_batch_only_models/. + fi + (cd preferred_batch_only_models/$(basename $TMP_MODEL_DIR) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific { versions: [1] }}/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [ 4, 6 ] }" >> config.pbtxt) +done + +# Setup variable-size model repository +rm -fr var_models && mkdir var_models +for BACKEND in $BACKENDS; do + TMP_MODEL_DIR="$DATADIR/qa_variable_model_repository/${BACKEND}_float32_float32_float32" + if [ "$BACKEND" == "python" ]; then + # We will be using ONNX models config.pbtxt and tweak them to make them + # appropriate for Python backend + onnx_model="${DATADIR}/qa_variable_model_repository/onnx_float32_float32_float32" + python_model=`echo $onnx_model | sed 's/onnx/python/g' | sed 's,'"$DATADIR/qa_variable_model_repository/"',,g'` + mkdir -p var_models/$python_model/1/ + cat $onnx_model/config.pbtxt | sed 's/platform:.*/backend:\ "python"/g' | sed 's/onnx/python/g' > var_models/$python_model/config.pbtxt + cp $onnx_model/output0_labels.txt var_models/$python_model + cp ../python_models/add_sub/model.py var_models/$python_model/1/ + else + cp -r $TMP_MODEL_DIR var_models/. + fi + (cd var_models/$(basename $TMP_MODEL_DIR) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific { versions: [1] }}/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) +done + +for MC in `ls var_models/*/config.pbtxt`; do + sed -i "s/16/-1/g" $MC done +# Create allow-ragged model to variable-size model repository +cp -r ../custom_models/custom_zero_1_float32 var_models/. 
&& \ + (cd var_models/custom_zero_1_float32 && mkdir 1 && \ + echo "instance_group [ { kind: KIND_GPU count: 1 }]" >> config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + sed -i "s/name:.*\"INPUT0\"/name: \"INPUT0\"\\nallow_ragged_batch: true/" config.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific { versions: [1] }}/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) + +if [[ $BACKENDS == *"plan"* ]]; then + # Use nobatch model to match the ragged test requirement + cp -r $DATADIR/qa_identity_model_repository/plan_nobatch_zero_1_float32 var_models/plan_zero_1_float32 && \ + (cd var_models/plan_zero_1_float32 && \ + sed -i "s/nobatch_//" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/name: \"INPUT0\"/name: \"INPUT0\"\\nallow_ragged_batch: true/" config.pbtxt && \ + echo "batch_output [{target_name: \"OUTPUT0\" \ + kind: BATCH_SCATTER_WITH_INPUT_SHAPE \ + source_input: \"INPUT0\" }] \ + dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) +fi + +if [[ $BACKENDS == *"onnx"* ]]; then + # Use nobatch model to match the ragged test requirement + cp -r $DATADIR/qa_identity_model_repository/onnx_nobatch_zero_1_float32 var_models/onnx_zero_1_float32 && \ + (cd var_models/onnx_zero_1_float32 && \ + sed -i "s/nobatch_//" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/name: \"INPUT0\"/name: \"INPUT0\"\\nallow_ragged_batch: true/" config.pbtxt && \ + echo "batch_output [{target_name: \"OUTPUT0\" \ + kind: BATCH_SCATTER_WITH_INPUT_SHAPE \ + source_input: \"INPUT0\" }] \ + dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) +fi + +if [[ $BACKENDS == *"libtorch"* ]]; then + # Use nobatch model to match the ragged test requirement + cp -r $DATADIR/qa_identity_model_repository/libtorch_nobatch_zero_1_float32 var_models/libtorch_zero_1_float32 && \ + (cd var_models/libtorch_zero_1_float32 && \ + sed -i "s/nobatch_//" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/name: \"INPUT__0\"/name: \"INPUT__0\"\\nallow_ragged_batch: true/" config.pbtxt && \ + echo "batch_output [{target_name: \"OUTPUT__0\" \ + kind: BATCH_SCATTER_WITH_INPUT_SHAPE \ + source_input: \"INPUT__0\" }] \ + dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >> config.pbtxt) +fi + # Need to launch the server for each test so that the model status is # reset (which is used to make sure the correctly batch size was used -# for execution) -for i in \ - test_static_batch_preferred \ - test_static_batch_lt_any_preferred \ - test_static_batch_not_preferred \ - test_static_batch_gt_max_preferred \ - test_multi_batch_not_preferred \ - test_multi_batch_gt_max_preferred \ - test_multi_batch_sum_gt_max_preferred \ - test_multi_same_output0 \ - test_multi_same_output1 \ - test_multi_different_outputs ; do - SERVER_ARGS="--model-store=`pwd`/models" - SERVER_LOG="./$i.serverlog" - run_server +# for execution). Test everything with fixed-tensor-size models and +# variable-tensor-size models. 
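+# batcher_test.py passes the settings appended above to check_setup(): the
+# preferred batch sizes ([2, 6] here, [4, 6] in the preferred-batch-only
+# repository) and the corresponding max_queue_delay_microseconds value.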
+ +for model_type in FIXED VARIABLE; do + export BATCHER_TYPE=$model_type + MODEL_PATH=models && [[ "$model_type" == "VARIABLE" ]] && MODEL_PATH=var_models + for i in $NO_DELAY_TESTS ; do + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$model_type.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$model_type.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, $model_type" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e + done + + # Tests that require TRITONSERVER_DELAY_SCHEDULER so that the + # scheduler is delayed and requests can collect in the queue. + for i in $DELAY_TESTS ; do + export TRITONSERVER_DELAY_SCHEDULER=6 && + [[ "$i" != "test_multi_batch_use_biggest_preferred" ]] && export TRITONSERVER_DELAY_SCHEDULER=3 && + [[ "$i" != "test_multi_batch_use_best_preferred" ]] && + [[ "$i" != "test_multi_batch_delayed_use_max_batch" ]] && export TRITONSERVER_DELAY_SCHEDULER=2 + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$model_type.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$model_type.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e + done +done + +export BATCHER_TYPE=VARIABLE +for i in $DIFFERENT_SHAPE_TESTS ; do + SERVER_ARGS="--model-repository=$MODELDIR/var_models ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.VARIABLE.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.VARIABLE.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG @@ -73,27 +395,50 @@ for i in \ echo "Test: $i" >>$CLIENT_LOG set +e - python $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 + python3 $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** Test Failed\n***" RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi fi set -e - kill $SERVER_PID - wait $SERVER_PID + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e done -# Tests that require TRTSERVER_DELAY_SCHEDULER so that the scheduler -# is delayed and requests can collect in the queue. +# Tests that run only on the variable-size tensor models and that +# require TRITONSERVER_DELAY_SCHEDULER so that the scheduler is delayed +# and requests can collect in the queue. +export BATCHER_TYPE=VARIABLE for i in \ - test_multi_batch_use_biggest_preferred \ - test_multi_batch_use_best_preferred ; do - export TRTSERVER_DELAY_SCHEDULER=6 && - [[ "$i" == "test_multi_batch_use_best_preferred" ]] && export TRTSERVER_DELAY_SCHEDULER=3 - SERVER_ARGS="--model-store=`pwd`/models" - SERVER_LOG="./$i.serverlog" - run_server + test_multi_batch_delayed_preferred_different_shape ; do + export TRITONSERVER_DELAY_SCHEDULER=4 + SERVER_ARGS="--model-repository=$MODELDIR/var_models ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.VARIABLE.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.VARIABLE.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG @@ -103,26 +448,331 @@ for i in \ echo "Test: $i" >>$CLIENT_LOG set +e - python $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 + python3 $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** Test Failed\n***" RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi fi set -e - unset TRTSERVER_DELAY_SCHEDULER - kill $SERVER_PID - wait $SERVER_PID + unset TRITONSERVER_DELAY_SCHEDULER + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e done -# python unittest seems to swallow ImportError and still return 0 exit -# code. So need to explicitly check CLIENT_LOG to make sure we see -# some running tests -grep -c "HTTP/1.1 200 OK" $CLIENT_LOG +export BATCHER_TYPE=FIXED +for i in $PREFERRED_BATCH_ONLY_TESTS ; do + export TRITONSERVER_DELAY_SCHEDULER=4 && + [[ "$i" != "test_preferred_batch_only_aligned" ]] && export TRITONSERVER_DELAY_SCHEDULER=5 && + [[ "$i" != "test_preferred_batch_only_unaligned" ]] && export TRITONSERVER_DELAY_SCHEDULER=7 && + [[ "$i" != "test_preferred_batch_only_use_biggest_preferred" ]] && export TRITONSERVER_DELAY_SCHEDULER=3 + SERVER_ARGS="--model-repository=$MODELDIR/preferred_batch_only_models ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.PREFERRED_BATCH_ONLY.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.PREFERRED_BATCH_ONLY.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e +done + +# Test cases that checks the runtime batches created with max_queue_delay +# specification only. +rm -fr ./custom_models && mkdir ./custom_models && \ +cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. && \ +mkdir -p ./custom_models/custom_zero_1_float32/1 + +# Provide sufficient delay to allow forming of next batch. +(cd custom_models/custom_zero_1_float32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + sed -i "s/max_batch_size:.*/max_batch_size: 100/g" config.pbtxt && \ + echo "dynamic_batching { max_queue_delay_microseconds: 0}" >> config.pbtxt && \ + echo "instance_group [ { kind: KIND_GPU } ]" >> config.pbtxt && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"100\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +for i in $MAX_QUEUE_DELAY_ONLY_TESTS ; do + export MAX_QUEUE_DELAY_MICROSECONDS=20000 && + [[ "$i" != "test_max_queue_delay_only_non_default" ]] && export MAX_QUEUE_DELAY_MICROSECONDS=0 + (cd custom_models/custom_zero_1_float32 && \ + sed -i "s/max_queue_delay_microseconds:.*\[.*\]/max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_MICROSECONDS}/g" config.pbtxt ) + + SERVER_ARGS="--model-repository=$MODELDIR/custom_models ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.MAX_QUEUE_DELAY_ONLY.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.MAX_QUEUE_DELAY_ONLY.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST BatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + kill_server + unset MAX_QUEUE_DELAY_MICROSECONDS + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e +done + +# Test that verify the 'preserve_ordering' option in dynamic batcher +# Run the test scheme with and without preserve ordering, verify behavior +# by comparing the "response send" timestamps. +TEST_CASE=test_multi_batch_preserve_ordering + +# Skip test for Windows. Trace file concats at 8192 chars on Windows. +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! -v MSYSTEM ]]; then + rm -fr ./custom_models && mkdir ./custom_models && \ + cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. 
&& \ + mkdir -p ./custom_models/custom_zero_1_float32/1 + + # Two instances will be created for the custom model, one delays 100 ms while + # the other delays 400 ms + (cd custom_models/custom_zero_1_float32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + sed -i "s/max_batch_size:.*/max_batch_size: 4/g" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [ 4 ] }" >> config.pbtxt && \ + echo "instance_group [ { kind: KIND_GPU count: 2 }]" >> config.pbtxt && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"100\" }}," >> config.pbtxt && \ + echo "{ key: \"instance_wise_delay_multiplier\"; value: { string_value: \"4\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + + # enqueue 3 batches to guarantee that a large delay batch will be followed by + # a small delay one regardless of the order issued to model instances. + # i.e. the 3 batches will be queued: [1, 2, 3] and there are two delay instances + # [small, large], then the distributions can be the following: + # [1:small 2:large 3:small] or [1:large 2:small 3:*] (* depends on whether order + # is preserved), and we only interested in the timestamps where the large delay + # batch is followed by small delay batch + export TRITONSERVER_DELAY_SCHEDULER=12 + + # not preserve + SERVER_ARGS="--trace-file=not_preserve.log --trace-level=MIN --trace-rate=1 --model-repository=$MODELDIR/custom_models ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./not_preserve.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./not_preserve.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: not_preserve" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST BatcherTest.$TEST_CASE >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + + python3 $VERIFY_TIMESTAMPS not_preserve.log + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # preserve + (cd custom_models/custom_zero_1_float32 && \ + sed -i "s/dynamic_batching.*/dynamic_batching { preferred_batch_size: [ 4 ] preserve_ordering: true }/g" config.pbtxt) + + SERVER_ARGS="--trace-file=preserve.log --trace-level=MIN --trace-rate=1 --model-repository=$MODELDIR/custom_models ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./preserve.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./preserve.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: preserve" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST BatcherTest.$TEST_CASE >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + + python3 $VERIFY_TIMESTAMPS -p preserve.log + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + unset TRITONSERVER_DELAY_SCHEDULER +fi + +# Test requests should be returned immediately upon timeout, without waiting for +# the next slot to be available and then returned. +rm -rf models && mkdir models +mkdir -p models/dynamic_batch/1 && (cd models/dynamic_batch && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'instance_group [{ count: 1 \n kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'dynamic_batching {' >> config.pbtxt && \ + echo -e ' preferred_batch_size: [ 1 ]' >> config.pbtxt && \ + echo -e ' default_queue_policy { timeout_action: REJECT \n default_timeout_microseconds: 1000000 \n max_queue_size: 8 }' >> config.pbtxt && \ + echo -e '}' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "8000" } }]' >> config.pbtxt) + +TEST_LOG="queue_timeout_test.log" +SERVER_LOG="./queue_timeout_test.server.log" + +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python queue_timeout_test.py > $TEST_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test Failed To Run\n***" + echo -e "\n***\n*** Queue Timeout Tests Failed\n***" + cat $TEST_LOG RET=1 fi +set -e + +kill $SERVER_PID +wait $SERVER_PID if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" @@ -132,3 +782,4 @@ else fi exit $RET + diff --git a/qa/L0_batcher/verify_timestamps.py b/qa/L0_batcher/verify_timestamps.py new file mode 100755 index 0000000000..3271135fcd --- /dev/null +++ b/qa/L0_batcher/verify_timestamps.py @@ -0,0 +1,126 @@ +#!/usr/bin/python +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json + +FLAGS = None + + +def verify_timestamps(traces, preserve): + # Order traces by id + traces = sorted(traces, key=lambda t: t.get("id", -1)) + + # Filter the trace that is not meaningful and group them by 'id' + filtered_traces = dict() + grpc_id_offset = 0 + for trace in traces: + if "id" not in trace: + continue + # Skip GRPC traces as actual traces are not generated via GRPC, + # thus GRPC traces are ill-formed + if "timestamps" in trace: + is_grpc = False + for ts in trace["timestamps"]: + if "GRPC" in ts["name"]: + is_grpc = True + break + if is_grpc: + grpc_id_offset += 1 + continue + + if trace["id"] in filtered_traces.keys(): + rep_trace = filtered_traces[trace["id"]] + # Append the timestamp to the trace representing this 'id' + if "timestamps" in trace: + rep_trace["timestamps"] += trace["timestamps"] + else: + # Use this trace to represent this 'id' + if "timestamps" not in trace: + trace["timestamps"] = [] + filtered_traces[trace["id"]] = trace + + # First find the latest response complete timestamp for the batch with large delay + large_delay_response_complete = 0 + small_delay_traces = [] + for trace_id, trace in filtered_traces.items(): + timestamps = dict() + for ts in trace["timestamps"]: + timestamps[ts["name"]] = ts["ns"] + # Hardcoded delay value here (knowing large delay is 400ms) + compute_span = timestamps["COMPUTE_END"] - timestamps["COMPUTE_START"] + # If the 3rd batch is also processed by large delay instance, we don't + # want to use its responses as baseline + if trace["id"] <= (8 + grpc_id_offset) and compute_span >= 400 * 1000 * 1000: + response_complete = timestamps["INFER_RESPONSE_COMPLETE"] + large_delay_response_complete = max( + large_delay_response_complete, response_complete + ) + else: + small_delay_traces.append(trace) + + response_request_after_large_delay_count = 0 + for trace in small_delay_traces: + timestamps = dict() + for ts in trace["timestamps"]: + timestamps[ts["name"]] = ts["ns"] + response_complete = timestamps["INFER_RESPONSE_COMPLETE"] + if response_complete > large_delay_response_complete: + response_request_after_large_delay_count += 1 + + # Hardcoded expected count here + print( + "responses after large delay count: {}".format( + response_request_after_large_delay_count + ) + ) + if preserve: + # If preserve ordering, there must be large delay batch followed by + # small delay batch and thus at least 4 responses are sent after + return 0 if response_request_after_large_delay_count >= 4 else 1 + else: + # If not preserve ordering, the small delay batches should all be done + # before large delay batch regardless of the ordering in scheduler + return 0 if response_request_after_large_delay_count == 0 else 1 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", + "--preserve", + action="store_true", + required=False, + default=False, + help="Timestamps is collected with preserve ordering", + ) + parser.add_argument("file", 
type=argparse.FileType("r"), nargs="+") + FLAGS = parser.parse_args() + + for f in FLAGS.file: + trace_data = json.loads(f.read()) + exit(verify_timestamps(trace_data, FLAGS.preserve)) diff --git a/qa/L0_buffer_attributes/buffer_attributes_test.py b/qa/L0_buffer_attributes/buffer_attributes_test.py new file mode 100755 index 0000000000..7d61e082c5 --- /dev/null +++ b/qa/L0_buffer_attributes/buffer_attributes_test.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonclient.utils.cuda_shared_memory as cudashm +from tritonclient.utils import triton_to_np_dtype + + +class BufferAttributesTest(tu.TestResultCollector): + def test_buffer_attributes(self): + model_name = "bls" + + # Infer + clients = [ + httpclient.InferenceServerClient(url="localhost:8000"), + grpcclient.InferenceServerClient(url="localhost:8001"), + ] + triton_clients = [httpclient, grpcclient] + for i, client in enumerate(clients): + # To make sure no shared memory regions are registered with the + # server. 
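+            # Unregistering here gives each protocol iteration (HTTP first, then
+            # gRPC) a clean starting state, so regions registered in a previous
+            # pass are not reused by mistake.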
+ client.unregister_system_shared_memory() + client.unregister_cuda_shared_memory() + + triton_client = triton_clients[i] + inputs = [] + outputs = [] + inputs.append(triton_client.InferInput("INPUT0", [1, 1000], "INT32")) + + input0_data = np.arange(start=0, stop=1000, dtype=np.int32) + input0_data = np.expand_dims(input0_data, axis=0) + + input_byte_size = input0_data.size * input0_data.itemsize + output_byte_size = input_byte_size + + shm_ip0_handle = cudashm.create_shared_memory_region( + "input0_data", input_byte_size, 0 + ) + shm_op0_handle = cudashm.create_shared_memory_region( + "output0_data", output_byte_size, 0 + ) + + client.register_cuda_shared_memory( + "input0_data", + cudashm.get_raw_handle(shm_ip0_handle), + 0, + input_byte_size, + ) + client.register_cuda_shared_memory( + "output0_data", + cudashm.get_raw_handle(shm_op0_handle), + 0, + input_byte_size, + ) + + cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + inputs[0].set_shared_memory("input0_data", input_byte_size) + + if triton_client is grpcclient: + outputs.append(triton_client.InferRequestedOutput("OUTPUT0")) + outputs[0].set_shared_memory("output0_data", output_byte_size) + else: + outputs.append( + triton_client.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs[0].set_shared_memory("output0_data", output_byte_size) + + results = client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + + output0 = results.get_output("OUTPUT0") + self.assertIsNotNone(output0) + if triton_client is grpcclient: + output0_data = cudashm.get_contents_as_numpy( + shm_op0_handle, triton_to_np_dtype(output0.datatype), output0.shape + ) + else: + output0_data = cudashm.get_contents_as_numpy( + shm_op0_handle, + triton_to_np_dtype(output0["datatype"]), + output0["shape"], + ) + self.assertTrue(np.all(output0_data == input0_data)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_buffer_attributes/models/bls/1/model.py b/qa/L0_buffer_attributes/models/bls/1/model.py new file mode 100644 index 0000000000..2d3e78e936 --- /dev/null +++ b/qa/L0_buffer_attributes/models/bls/1/model.py @@ -0,0 +1,54 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +# Simple Python model that executes a BLS request on an identity model. +class TritonPythonModel: + def execute(self, requests): + responses = [] + for request in requests: + # Get INPUT0 + input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + infer_request = pb_utils.InferenceRequest( + model_name="identity", + requested_output_names=["OUTPUT0"], + inputs=[input0], + ) + infer_response = infer_request.exec() + + if infer_response.has_error(): + raise pb_utils.TritonModelException(infer_response.error().message()) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + ] + ) + responses.append(inference_response) + + return responses diff --git a/qa/L0_buffer_attributes/models/bls/config.pbtxt b/qa/L0_buffer_attributes/models/bls/config.pbtxt new file mode 100644 index 0000000000..d0d6cd3260 --- /dev/null +++ b/qa/L0_buffer_attributes/models/bls/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
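+
+# Configuration for the Python BLS model used by the buffer-attributes test.
+# Note that FORCE_CPU_ONLY_INPUT_TENSORS is set to "no" further below, which
+# lets the Python backend receive GPU tensors directly instead of forcing host
+# copies; presumably this is what allows the CUDA shared-memory path to be
+# exercised end to end.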
+ +name: "bls" +backend: "python" +max_batch_size: 64 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1000 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1000 ] + } +] +instance_group [{ kind: KIND_GPU }] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} diff --git a/qa/L0_buffer_attributes/models/identity/1/model.py b/qa/L0_buffer_attributes/models/identity/1/model.py new file mode 100644 index 0000000000..2d4b592ae3 --- /dev/null +++ b/qa/L0_buffer_attributes/models/identity/1/model.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Identity model using DLPack in Python backend. + """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor.from_dlpack( + "OUTPUT0", input_tensor.to_dlpack() + ) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/L0_buffer_attributes/models/identity/config.pbtxt b/qa/L0_buffer_attributes/models/identity/config.pbtxt new file mode 100644 index 0000000000..aa9d63e68c --- /dev/null +++ b/qa/L0_buffer_attributes/models/identity/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +backend: "python" +max_batch_size: 64 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1000 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1000 ] + } +] +instance_group [{ kind: KIND_GPU }] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} diff --git a/qa/L0_buffer_attributes/test.sh b/qa/L0_buffer_attributes/test.sh new file mode 100755 index 0000000000..7e2f35d837 --- /dev/null +++ b/qa/L0_buffer_attributes/test.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +CLIENT_LOG="./buffer_attributes_client.log" +TEST_PY=./buffer_attributes_test.py +EXPECTED_NUM_TESTS="1" +TEST_RESULT_FILE='test_results.txt' + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_PY >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh new file mode 100755 index 0000000000..9dc1c4c85d --- /dev/null +++ b/qa/L0_client_build_variants/test.sh @@ -0,0 +1,212 @@ +#!/bin/bash +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Install required dependencies for client build +apt-get update && \ +apt-get install -y --no-install-recommends \ + rapidjson-dev + +# Client build requires recent version of CMake (FetchContent required) +# Using CMAKE installation instruction from:: https://apt.kitware.com/ +apt update -q=2 \ + && apt install -y gpg wget \ + && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \ + && . 
/etc/os-release \ + && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \ + && apt-get update -q=2 \ + && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* +cmake --version + + +set +e + +mkdir -p /workspace/build + +# +# Build without GPU support +# +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +(cd /workspace/build && \ + rm -fr cc-clients java-clients python-clients && \ + cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_ENABLE_CC_HTTP=ON \ + -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PYTHON_HTTP=ON \ + -DTRITON_ENABLE_PYTHON_GRPC=ON \ + -DTRITON_ENABLE_JAVA_HTTP=ON \ + -DTRITON_ENABLE_EXAMPLES=ON \ + -DTRITON_ENABLE_TESTS=ON \ + -DTRITON_ENABLE_GPU=OFF \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ + /workspace/client && \ + make -j16 cc-clients java-clients python-clients) +if [ $? -eq 0 ]; then + echo -e "\n***\n*** No-GPU Passed\n***" +else + echo -e "\n***\n*** No-GPU FAILED\n***" + exit 1 +fi + +# +# Build without HTTP +# Skip this test for java-clients because we can only build +# java-clients with http protocol +# +(cd /workspace/build && \ + rm -fr cc-clients python-clients && \ + cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_ENABLE_CC_HTTP=OFF \ + -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PYTHON_HTTP=OFF \ + -DTRITON_ENABLE_PYTHON_GRPC=ON \ + -DTRITON_ENABLE_EXAMPLES=ON \ + -DTRITON_ENABLE_TESTS=ON \ + -DTRITON_ENABLE_GPU=ON \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ + /workspace/client && \ + make -j16 cc-clients python-clients) +if [ $? -eq 0 ]; then + echo -e "\n***\n*** No-HTTP Passed\n***" +else + echo -e "\n***\n*** No-HTTP FAILED\n***" + exit 1 +fi + +# +# Build without GRPC +# Skip this test for java-clients because grpc protocol is not supported +# +(cd /workspace/build && \ + rm -fr cc-clients python-clients && \ + cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_ENABLE_CC_HTTP=ON \ + -DTRITON_ENABLE_CC_GRPC=OFF \ + -DTRITON_ENABLE_PYTHON_HTTP=ON \ + -DTRITON_ENABLE_PYTHON_GRPC=OFF \ + -DTRITON_ENABLE_EXAMPLES=ON \ + -DTRITON_ENABLE_TESTS=ON \ + -DTRITON_ENABLE_GPU=ON \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \ + /workspace/client && \ + make -j16 cc-clients python-clients) +if [ $? 
-eq 0 ]; then + echo -e "\n***\n*** No-GRPC Passed\n***" +else + echo -e "\n***\n*** No-GRPC FAILED\n***" + exit 1 +fi + +# TODO: TPRD-342 These tests should be PA CI test +# cases not Triton test cases +rm -fr /workspace/build +mkdir -p /workspace/build +# +# Build without C API in Perf Analyzer +# +(cd /workspace/build && \ + cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_ENABLE_CC_HTTP=ON \ + -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \ + -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ + -DTRITON_ENABLE_GPU=ON \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) +if [ $? -eq 0 ]; then + echo -e "\n***\n*** No-CAPI Passed\n***" +else + echo -e "\n***\n*** No-CAPI FAILED\n***" + exit 1 +fi + +# +# Build without TensorFlow Serving in Perf Analyzer +# +(cd /workspace/build && \ + rm -fr cc_clients perf_analyzer && \ + cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_ENABLE_CC_HTTP=ON \ + -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \ + -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \ + -DTRITON_ENABLE_GPU=ON \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) +if [ $? -eq 0 ]; then + echo -e "\n***\n*** No-TF-Serving Passed\n***" +else + echo -e "\n***\n*** No-TF-Serving FAILED\n***" + exit 1 +fi + +# +# Build without TorchServe in Perf Analyzer +# +(cd /workspace/build && \ + rm -fr cc_clients perf_analyzer && \ + cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \ + -DTRITON_ENABLE_CC_HTTP=ON \ + -DTRITON_ENABLE_CC_GRPC=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \ + -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \ + -DTRITON_ENABLE_GPU=ON \ + -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \ + -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \ + -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \ + -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \ + /workspace/perf_analyzer && \ + make -j16 perf-analyzer) +if [ $? -eq 0 ]; then + echo -e "\n***\n*** No-TorchServe Passed\n***" +else + echo -e "\n***\n*** No-TorchServe FAILED\n***" + exit 1 +fi + +set -e + +echo -e "\n***\n*** Test Passed\n***" diff --git a/qa/L0_client_java/test.sh b/qa/L0_client_java/test.sh new file mode 100755 index 0000000000..0300ff1bce --- /dev/null +++ b/qa/L0_client_java/test.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server} +TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG:="main"} + +RET=0 + +rm -f *.log.* + +# Get the proto files from the common repo +rm -fr common +git clone --single-branch --depth=1 -b $TRITON_COMMON_REPO_TAG \ + ${TRITON_REPO_ORGANIZATION}/common.git +cp common/protobuf/*.proto java/library/src/main/proto/. + +# Compile library +(cd java/library && \ + mvn compile && \ + cp -R target/generated-sources/protobuf/java/inference ../examples/src/main/java/inference && \ + cp -r target/generated-sources/protobuf/grpc-java/inference/*.java ../examples/src/main/java/inference/) + +# Build simple java and scala client example +(cd java/examples && mvn clean install) + +CLIENT_LOG=`pwd`/client.log +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +source ../common/util.sh + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +pushd java/examples + +# Test grpc_generated simple java client example +mvn exec:java -Dexec.mainClass=clients.SimpleJavaClient -Dexec.args="localhost 8001" >> ${CLIENT_LOG}.java 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.java + RET=1 +fi + +# Test grpc_generated simple scala client example +mvn exec:java -Dexec.mainClass=clients.SimpleClient -Dexec.args="localhost 8001" >> ${CLIENT_LOG}.scala 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.scala + RET=1 +fi + +popd + +# Test simple infer java client +SIMPLE_INFER_JAVA_CLIENT=../clients/SimpleInferClient.jar + +pushd ../clients + +java -jar ${SIMPLE_INFER_JAVA_CLIENT} >> ${CLIENT_LOG}.simple_infer_java 2>&1 +if [ $? 
-ne 0 ]; then + cat ${CLIENT_LOG}.simple_infer_java + RET=1 +fi + +popd +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_client_memory_growth/client_memory_mail.py b/qa/L0_client_memory_growth/client_memory_mail.py new file mode 100755 index 0000000000..ef1703f2c3 --- /dev/null +++ b/qa/L0_client_memory_growth/client_memory_mail.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import glob +from datetime import date + +import nightly_email_helper + +if __name__ == "__main__": + today = date.today().strftime("%Y-%m-%d") + subject = "Triton Client Memory Growth " + sys.argv[1] + " Summary: " + today + memory_graphs = glob.glob("client_memory_growth*.log") + write_up = "

<p>This test is run for both HTTP and GRPC protocols using C++ and Python test scripts. The max-allowed difference between mean and maximum memory usage is set to 10MB and 1MB for C++ and Python tests individually.</p>"
+    write_up += "<p><b>• What to look for</b><br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
+    html_content = (
+        '<html><head></head><body><pre>'
+        + write_up
+        + '</pre><pre>'
+    )
+    for mem_graph in sorted(memory_graphs):
+        html_content += "\n" + mem_graph + "\n"
+        with open(mem_graph, "r") as f:
+            html_content += f.read() + "\n"
+    html_content += "</pre></body></html>
" + nightly_email_helper.send(subject, html_content, is_html=True) diff --git a/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt b/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt new file mode 100644 index 0000000000..6a2a76bde5 --- /dev/null +++ b/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_identity_int32" +backend: "identity" +max_batch_size: 1024 +version_policy: { latest { num_versions: 1 }} +instance_group [ { kind: KIND_CPU } ] + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] \ No newline at end of file diff --git a/qa/L0_client_memory_growth/test.sh b/qa/L0_client_memory_growth/test.sh new file mode 100755 index 0000000000..73188812b2 --- /dev/null +++ b/qa/L0_client_memory_growth/test.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. +export CUDA_VISIBLE_DEVICES=0 + +LEAKCHECK=/usr/bin/valgrind +LEAKCHECK_ARGS_BASE="--max-threads=3000 --tool=massif --time-unit=B" +SERVER_TIMEOUT=3600 +rm -f *.log *.massif + +MEMORY_GROWTH_TEST_CPP=../clients/memory_leak_test +MEMORY_GROWTH_TEST_PY=../clients/memory_growth_test.py +MASSIF_TEST=../common/check_massif_log.py + +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +source ../common/util.sh + +# Set the number of repetitions in nightly and weekly tests +# Set the email subject for nightly and weekly tests +if [ "$TRITON_PERF_WEEKLY" == 1 ]; then + if [ "$TRITON_PERF_LONG" == 1 ]; then + # ~ 12 hours + # GRPC cycles are reduced as there is high fluctuation in time spent + REPETITION_HTTP_CPP=2220000 + REPETITION_HTTP_PY=3600000 + REPETITION_GRPC_CPP=8000000 + REPETITION_GRPC_PY=1500000 + EMAIL_SUBJECT="Weekly Long" + else + # Run the test for each case approximately 1.5 hours + # All tests are run cumulatively for 7 hours + REPETITION_HTTP_CPP=1300000 + REPETITION_HTTP_PY=2100000 + REPETITION_GRPC_CPP=6600000 + REPETITION_GRPC_PY=1000000 + EMAIL_SUBJECT="Weekly" + fi +else + REPETITION_CPP=100000 + REPETITION_PY=10000 + EMAIL_SUBJECT="Nightly" +fi + +mkdir -p $DATADIR/custom_identity_int32/1 + +RET=0 + +# Run test for both HTTP and GRPC, not re-using client object. +for PROTOCOL in http grpc; do + for LANG in c++ python; do + LEAKCHECK_LOG="./valgrind.${PROTOCOL}.${LANG}.log" + CLIENT_LOG="./client.${PROTOCOL}.${LANG}.log" + GRAPH_LOG="./client_memory_growth.${PROTOCOL}.${LANG}.log" + MASSIF_LOG="./${PROTOCOL}.${LANG}.massif" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG --massif-out-file=$MASSIF_LOG" + + if [ "$TRITON_PERF_WEEKLY" == 1 ]; then + if [ $PROTOCOL == http ]; then + REPETITION_CPP=$REPETITION_HTTP_CPP + REPETITION_PY=$REPETITION_HTTP_PY + else + REPETITION_CPP=$REPETITION_GRPC_CPP + REPETITION_PY=$REPETITION_GRPC_PY + fi + fi + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # MAX_ALLOWED_ALLOC is the threshold memory growth in MB + if [ "$LANG" == "c++" ]; then + MEMORY_GROWTH_TEST=$MEMORY_GROWTH_TEST_CPP + MAX_ALLOWED_ALLOC="10" + # NOTE: This test has risk of exhausting all available sockets in + # the ephemeral port range. Re-using the same client connection + # ("-R") can easily solve this problem. 
However, to cleanly separate + # the resources used by different client objects, we create new + # connections for each request and retry/sleep on failure to give + # the system time to reclaim sockets after TIME_WAIT. + # TIP: You can use the "ss -s" command to observe the socket usage. + EXTRA_ARGS="-r ${REPETITION_CPP} -i ${PROTOCOL}" + else + MEMORY_GROWTH_TEST="python $MEMORY_GROWTH_TEST_PY" + MAX_ALLOWED_ALLOC="1" + EXTRA_ARGS="-r ${REPETITION_PY} -i ${PROTOCOL}" + fi + + set +e + SECONDS=0 + $LEAKCHECK $LEAKCHECK_ARGS $MEMORY_GROWTH_TEST $EXTRA_ARGS >> ${CLIENT_LOG} 2>&1 + TEST_RETCODE=$? + TEST_DURATION=$SECONDS + set -e + if [ ${TEST_RETCODE} -ne 0 ]; then + cat ${CLIENT_LOG} + RET=1 + echo -e "\n***\n*** Test FAILED\n***" + else + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Memory leak detected\n***" + RET=1 + fi + + set +e + # Check for memory growth + python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC >> ${CLIENT_LOG}.massif 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Massif Test for ${PROTOCOL} ${LANG} Failed\n***" + RET=1 + fi + + # Log test duration, the graph for memory growth and the change between Average and Max memory usage + hrs=$(printf "%02d" $((TEST_DURATION / 3600))) + mins=$(printf "%02d" $(((TEST_DURATION / 60) % 60))) + secs=$(printf "%02d" $((TEST_DURATION % 60))) + echo -e "Test Duration: $hrs:$mins:$secs (HH:MM:SS)" >> ${GRAPH_LOG} + cat ${CLIENT_LOG}.massif + ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG} + cat ${GRAPH_LOG} + set -e + fi + + # Stop Server + kill $SERVER_PID + wait $SERVER_PID + done +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +# Run only if both TRITON_FROM and TRITON_TO_DL are set +if [[ ! -z "$TRITON_FROM" ]] && [[ ! -z "$TRITON_TO_DL" ]]; then + python client_memory_mail.py "$EMAIL_SUBJECT" +fi + +exit $RET diff --git a/qa/L0_client_nobatch/client_test.py b/qa/L0_client_nobatch/client_test.py old mode 100644 new mode 100755 index 5faeaae604..3288fc2ebf --- a/qa/L0_client_nobatch/client_test.py +++ b/qa/L0_client_nobatch/client_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,101 +27,216 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + sys.path.append("../common") -from builtins import range -from future.utils import iteritems import unittest + import numpy as np -from tensorrtserver.api import * import test_util as tu +import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient +from tritonclient.utils import InferenceServerException + -class ClientNoBatchTest(unittest.TestCase): - def test_bs0_request_for_batching_model(self): +class ClientNoBatchTest(tu.TestResultCollector): + def test_nobatch_request_for_batching_model(self): input_size = 16 - tensor_shape = (input_size,) - # graphdef_int32_int8_int8 has a batching version. If we make - # a batch-size 0 request for that model we still allow it - # (treated as batch-size 1). - for protocol, url in ((ProtocolType.HTTP, 'localhost:8000'), - (ProtocolType.GRPC, 'localhost:8001')): + # graphdef_int32_int8_int8 has a batching version with max batch size of 8. 
+ # The server should return an error if the batch size is not included in the + # input shapes. + tensor_shape = (input_size,) + for protocol in ["http", "grpc"]: model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8) in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) - ctx = InferContext(url, protocol, model_name, None, True) - results = ctx.run({ 'INPUT0' : (in0,), - 'INPUT1' : (in1,) }, - { 'OUTPUT0' : InferContext.ResultFormat.RAW, - 'OUTPUT1' : InferContext.ResultFormat.RAW }, - 0) + inputs = [] + outputs = [] + if protocol == "http": + triton_client = tritonhttpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + inputs.append( + tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1")) + else: + triton_client = tritongrpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + inputs.append( + tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) + + # Initialize the data + inputs[0].set_data_from_numpy(in0) + inputs[1].set_data_from_numpy(in1) - def test_bs0_request_for_non_batching_model(self): + try: + _ = triton_client.infer(model_name, inputs, outputs=outputs) + self.assertTrue( + False, "expected failure with no batch request for batching model" + ) + except InferenceServerException as ex: + pass + + def test_batch_request_for_nobatching_model(self): input_size = 16 - tensor_shape = (input_size,) - # graphdef_int32_int8_int8 has a non-batching version. If we - # make a batch-size zero request for that model it should - # pass. - for protocol, url in ((ProtocolType.HTTP, 'localhost:8000'), - (ProtocolType.GRPC, 'localhost:8001')): - model_name = tu.get_model_name("graphdef_nobatch", np.int32, np.int8, np.int8) + # graphdef_nobatch_int32_int8_int8 is non batching version. 
+ # The server should return an error if the batch size dimension + # is included in the shape + tensor_shape = (1, input_size) + for protocol in ["http", "grpc"]: + model_name = tu.get_model_name( + "graphdef_nobatch", np.int32, np.int8, np.int8 + ) in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) - ctx = InferContext(url, protocol, model_name, None, True) - results = ctx.run({ 'INPUT0' : (in0,), - 'INPUT1' : (in1,) }, - { 'OUTPUT0' : InferContext.ResultFormat.RAW, - 'OUTPUT1' : InferContext.ResultFormat.RAW }, - 0) + inputs = [] + outputs = [] + if protocol == "http": + triton_client = tritonhttpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + inputs.append( + tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1")) + else: + triton_client = tritongrpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + inputs.append( + tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) + + # Initialize the data + inputs[0].set_data_from_numpy(in0) + inputs[1].set_data_from_numpy(in1) + + try: + _ = triton_client.infer(model_name, inputs, outputs=outputs) + self.assertTrue( + False, + "expected failure with batched request for non-batching model", + ) + except InferenceServerException as ex: + pass - def test_bs1_request_for_non_batching_model(self): + def test_nobatch_request_for_nonbatching_model(self): input_size = 16 - tensor_shape = (input_size,) - # graphdef_int32_int8_int8 has a non-batching version. If we - # make a batch-size one request for that model it should - # pass. - for protocol, url in ((ProtocolType.HTTP, 'localhost:8000'), - (ProtocolType.GRPC, 'localhost:8001')): - model_name = tu.get_model_name("graphdef_nobatch", np.int32, np.int8, np.int8) + # graphdef_int32_int8_int8 has a batching version with max batch size of 8. + # The server should return an error if the batch size is not included in the + # input shapes. 
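+        # (The comment above appears to be carried over from the batching-model
+        # tests: for the non-batching graphdef_nobatch model a request with
+        # shape (input_size,) is valid, so the infer call below is expected to
+        # succeed rather than raise.)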
+ tensor_shape = (input_size,) + for protocol in ["http", "grpc"]: + model_name = tu.get_model_name( + "graphdef_nobatch", np.int32, np.int8, np.int8 + ) in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) - ctx = InferContext(url, protocol, model_name, None, True) - results = ctx.run({ 'INPUT0' : (in0,), - 'INPUT1' : (in1,) }, - { 'OUTPUT0' : InferContext.ResultFormat.RAW, - 'OUTPUT1' : InferContext.ResultFormat.RAW }, - 1) - - def test_bs2_request_for_non_batching_model(self): + inputs = [] + outputs = [] + if protocol == "http": + triton_client = tritonhttpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + inputs.append( + tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1")) + else: + triton_client = tritongrpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + inputs.append( + tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) + + # Initialize the data + inputs[0].set_data_from_numpy(in0) + inputs[1].set_data_from_numpy(in1) + + results = triton_client.infer(model_name, inputs, outputs=outputs) + + def test_batch_request_for_batching_model(self): input_size = 16 - tensor_shape = (input_size,) - # graphdef_int32_int8_int8 has a non-batching version. If we - # make a batch-size two (or greater) request for that model it - # should fail. - for protocol, url in ((ProtocolType.HTTP, 'localhost:8000'), - (ProtocolType.GRPC, 'localhost:8001')): - model_name = tu.get_model_name("graphdef_nobatch", np.int32, np.int8, np.int8) + # graphdef_nobatch_int32_int8_int8 is non batching version. 
+ # The server should return an error if the batch size dimension + # is included in the shape + tensor_shape = (1, input_size) + for protocol in ["http", "grpc"]: + model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8) in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32) - try: - ctx = InferContext(url, protocol, model_name, None, True) - results = ctx.run({ 'INPUT0' : (in0,), - 'INPUT1' : (in1,) }, - { 'OUTPUT0' : InferContext.ResultFormat.RAW, - 'OUTPUT1' : InferContext.ResultFormat.RAW }, - 2) - self.assertTrue(False, "expected failure with batch-size 2 for non-batching model") - - except InferenceServerException as ex: - pass - - -if __name__ == '__main__': + inputs = [] + outputs = [] + if protocol == "http": + triton_client = tritonhttpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + inputs.append( + tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1")) + else: + triton_client = tritongrpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + inputs.append( + tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32") + ) + inputs.append( + tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32") + ) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) + + # Initialize the data + inputs[0].set_data_from_numpy(in0) + inputs[1].set_data_from_numpy(in1) + + results = triton_client.infer(model_name, inputs, outputs=outputs) + + +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_client_nobatch/test.sh b/qa/L0_client_nobatch/test.sh index 9f93030a4f..58b1b3dc58 100755 --- a/qa/L0_client_nobatch/test.sh +++ b/qa/L0_client_nobatch/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,13 +25,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' CLIENT_LOG="./client.log" CLIENT_TEST=client_test.py +EXPECTED_NUM_TESTS="4" -DATADIR=/data/inferenceserver +DATADIR=/data/inferenceserver/${REPO_VERSION} -SERVER=/opt/tensorrtserver/bin/trtserver -SERVER_ARGS=--model-store=$DATADIR/qa_model_repository +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR/qa_model_repository" SERVER_LOG="./inference_server.log" source ../common/util.sh @@ -55,13 +72,13 @@ if [ $? -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 -fi - -grep -c "HTTP/1.1 200 OK" $CLIENT_LOG -if [ $? 
-ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Failed To Run\n***" - RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi fi set -e diff --git a/qa/L0_client_timeout/client_infer_timeout_test.py b/qa/L0_client_timeout/client_infer_timeout_test.py new file mode 100755 index 0000000000..700e9bfe9b --- /dev/null +++ b/qa/L0_client_timeout/client_infer_timeout_test.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
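+#
+# client_infer_timeout_test.py exercises client-side timeouts on inference
+# requests over both gRPC and HTTP. The custom_identity_int32 model used
+# below is served by the identity backend with execute_delay_ms set to 3000
+# in its config.pbtxt, so every response takes roughly three seconds. A
+# small client timeout is therefore expected to fail ("Deadline Exceeded"
+# for gRPC, socket.timeout for HTTP), while a sufficiently large timeout is
+# expected to succeed and return the input tensor unchanged.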
+ +import sys + +sys.path.append("../common") + +import queue +import socket +import unittest +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class ClientInferTimeoutTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "custom_identity_int32" + self.input0_data_ = np.array([[10]], dtype=np.int32) + self.input0_data_byte_size_ = 32 + self.INFER_SMALL_INTERVAL = 2.0 # seconds for a timeout + + def _prepare_request(self, protocol): + if protocol == "grpc": + self.inputs_ = [] + self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "INT32")) + self.outputs_ = [] + self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0")) + else: + self.inputs_ = [] + self.inputs_.append(httpclient.InferInput("INPUT0", [1, 1], "INT32")) + self.outputs_ = [] + self.outputs_.append(httpclient.InferRequestedOutput("OUTPUT0")) + + self.inputs_[0].set_data_from_numpy(self.input0_data_) + + def test_grpc_infer(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + self._prepare_request("grpc") + + # The model is configured to take three seconds to send the + # response. Expect an exception for small timeout values. + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.infer( + model_name=self.model_name_, + inputs=self.inputs_, + outputs=self.outputs_, + client_timeout=0.2, + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + # Expect inference to pass successfully for a large timeout + # value + result = triton_client.infer( + model_name=self.model_name_, + inputs=self.inputs_, + outputs=self.outputs_, + client_timeout=10, + ) + + output0_data = result.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(self.input0_data_, output0_data)) + + def test_grpc_async_infer(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + self._prepare_request("grpc") + + user_data = UserData() + + # The model is configured to take three seconds to send the + # response. Expect an exception for small timeout values. 
+ with self.assertRaises(InferenceServerException) as cm: + triton_client.async_infer( + model_name=self.model_name_, + inputs=self.inputs_, + callback=partial(callback, user_data), + outputs=self.outputs_, + client_timeout=self.INFER_SMALL_INTERVAL, + ) + data_item = user_data._completed_requests.get() + if type(data_item) == InferenceServerException: + raise data_item + self.assertIn("Deadline Exceeded", str(cm.exception)) + + # Expect inference to pass successfully for a large timeout + # value + triton_client.async_infer( + model_name=self.model_name_, + inputs=self.inputs_, + callback=partial(callback, user_data), + outputs=self.outputs_, + client_timeout=10, + ) + + # Wait until the results are available in user_data + data_item = user_data._completed_requests.get() + self.assertFalse(type(data_item) == InferenceServerException) + + output0_data = data_item.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(self.input0_data_, output0_data)) + + def test_grpc_stream_infer(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + + self._prepare_request("grpc") + user_data = UserData() + + # The model is configured to take three seconds to send the + # response. Expect an exception for small timeout values. + with self.assertRaises(InferenceServerException) as cm: + triton_client.stop_stream() + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=1 + ) + triton_client.async_stream_infer( + model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ + ) + data_item = user_data._completed_requests.get() + if type(data_item) == InferenceServerException: + raise data_item + self.assertIn("Deadline Exceeded", str(cm.exception)) + + # Expect inference to pass successfully for a large timeout + # value + triton_client.stop_stream() + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=100 + ) + + triton_client.async_stream_infer( + model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ + ) + data_item = user_data._completed_requests.get() + triton_client.stop_stream() + + if type(data_item) == InferenceServerException: + raise data_item + output0_data = data_item.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(self.input0_data_, output0_data)) + + def test_http_infer(self): + self._prepare_request("http") + + # The model is configured to take three seconds to send the + # response. Expect an exception for small timeout values. + with self.assertRaises(socket.timeout) as cm: + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", + verbose=True, + network_timeout=self.INFER_SMALL_INTERVAL, + ) + _ = triton_client.infer( + model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ + ) + self.assertIn("timed out", str(cm.exception)) + + # Expect to successfully pass with sufficiently large timeout + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True, connection_timeout=10.0 + ) + + result = triton_client.infer( + model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ + ) + + output0_data = result.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(self.input0_data_, output0_data)) + + def test_http_async_infer(self): + self._prepare_request("http") + + # The model is configured to take three seconds to send the + # response. Expect an exception for small timeout values. 
+ with self.assertRaises(socket.timeout) as cm: + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", + verbose=True, + network_timeout=self.INFER_SMALL_INTERVAL, + ) + async_request = triton_client.async_infer( + model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ + ) + result = async_request.get_result() + self.assertIn("timed out", str(cm.exception)) + + # Expect to successfully pass with sufficiently large timeout + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True, connection_timeout=10.0 + ) + + async_request = triton_client.async_infer( + model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_ + ) + result = async_request.get_result() + + output0_data = result.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(self.input0_data_, output0_data)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_client_timeout/client_non_infer_timeout_test.py b/qa/L0_client_timeout/client_non_infer_timeout_test.py new file mode 100755 index 0000000000..bbaf8c34e8 --- /dev/null +++ b/qa/L0_client_timeout/client_non_infer_timeout_test.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
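+#
+# client_non_infer_timeout_test.py exercises the client_timeout argument on
+# the non-inference gRPC APIs: server/model liveness and readiness, server
+# and model metadata, model config, repository index, model load/unload,
+# inference statistics, trace and log settings, and system/CUDA shared
+# memory registration. The accompanying test.sh starts the server with
+# TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC set, so each call made with
+# self.SMALL_INTERVAL is expected to fail with "Deadline Exceeded" while
+# the same call made with self.NORMAL_INTERVAL is expected to succeed.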
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class ClientNonInferTimeoutTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "custom_identity_int32" + self.input0_data_ = np.array([[10]], dtype=np.int32) + self.input0_data_byte_size_ = 32 + self.SMALL_INTERVAL = 0.1 # seconds for a timeout + self.NORMAL_INTERVAL = 5.0 # seconds for server to load then receive request + + def test_grpc_server_live(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.is_server_live(client_timeout=self.SMALL_INTERVAL) + self.assertIn("Deadline Exceeded", str(cm.exception)) + self.assertTrue( + triton_client.is_server_live(client_timeout=self.NORMAL_INTERVAL) + ) + + def test_grpc_is_server_ready(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.is_server_ready(client_timeout=self.SMALL_INTERVAL) + self.assertIn("Deadline Exceeded", str(cm.exception)) + self.assertTrue( + triton_client.is_server_ready(client_timeout=self.NORMAL_INTERVAL) + ) + + def test_grpc_is_model_ready(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.is_model_ready( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + self.assertTrue( + triton_client.is_model_ready( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + ) + + def test_grpc_get_server_metadata(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_server_metadata(client_timeout=self.SMALL_INTERVAL) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + triton_client.get_server_metadata(client_timeout=self.NORMAL_INTERVAL) + + def test_grpc_get_model_metadata(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_model_metadata( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_model_metadata( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_model_config(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_model_config( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_model_config( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_model_repository_index(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_model_repository_index( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + 
triton_client.get_model_repository_index(client_timeout=self.NORMAL_INTERVAL) + + def test_grpc_load_model(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + triton_client.unload_model(model_name=self.model_name_) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.load_model( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unload_model( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + triton_client.load_model( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_unload_model(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.unload_model( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.load_model(model_name=self.model_name_) + triton_client.unload_model( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + triton_client.load_model(model_name=self.model_name_) + + def test_grpc_get_inference_statistics(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_inference_statistics( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_inference_statistics( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_update_trace_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.update_trace_settings( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.update_trace_settings( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_trace_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_trace_settings( + model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_trace_settings( + model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_update_log_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + settings = {} + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.update_log_settings( + settings=settings, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.update_log_settings( + settings=settings, client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_log_settings(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_log_settings( + as_json=True, client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_log_settings( + as_json=True, 
client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_system_shared_memory_status(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_system_shared_memory_status( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_system_shared_memory_status( + client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_register_system_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + triton_client.unregister_system_shared_memory() + import tritonclient.utils.shared_memory as shm + + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input_simple", self.input0_data_byte_size_ + ) + shm.set_shared_memory_region(shm_ip0_handle, [self.input0_data_]) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.register_system_shared_memory( + "input0_data", + "/input_simple", + self.input0_data_byte_size_, + client_timeout=self.SMALL_INTERVAL, + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_system_shared_memory() + triton_client.register_system_shared_memory( + "input0_data", + "/input_simple", + self.input0_data_byte_size_, + client_timeout=self.NORMAL_INTERVAL, + ) + triton_client.unregister_system_shared_memory() + + def test_grpc_unregister_system_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.unregister_system_shared_memory( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_system_shared_memory( + client_timeout=self.NORMAL_INTERVAL + ) + + def test_grpc_get_cuda_shared_memory_status(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.get_cuda_shared_memory_status( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.get_cuda_shared_memory_status(client_timeout=self.NORMAL_INTERVAL) + + def test_grpc_register_cuda_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + import tritonclient.utils.cuda_shared_memory as cshm + + input_data = np.array([[10]], dtype=np.int32) + byteSize = input_data.itemsize * input_data.size + shm_op0_handle = cshm.create_shared_memory_region( + "dummy_data", byte_size=byteSize, device_id=0 + ) + cshm.set_shared_memory_region(shm_op0_handle, [input_data]) + with self.assertRaises(InferenceServerException) as cm: + _ = triton_client.register_cuda_shared_memory( + "dummy_data", + cshm.get_raw_handle(shm_op0_handle), + device_id=0, + byte_size=byteSize, + client_timeout=self.SMALL_INTERVAL, + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_cuda_shared_memory() + triton_client.register_cuda_shared_memory( + "dummy_data", + cshm.get_raw_handle(shm_op0_handle), + device_id=0, + byte_size=byteSize, + client_timeout=self.NORMAL_INTERVAL, + ) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_grpc_unregister_cuda_shared_memory(self): + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + with 
self.assertRaises(InferenceServerException) as cm: + _ = triton_client.unregister_cuda_shared_memory( + client_timeout=self.SMALL_INTERVAL + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + triton_client.unregister_cuda_shared_memory(client_timeout=self.NORMAL_INTERVAL) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt b/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt new file mode 100644 index 0000000000..1732ff32fd --- /dev/null +++ b/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt @@ -0,0 +1,54 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_identity_int32" +backend: "identity" +max_batch_size: 1024 +version_policy: { latest { num_versions: 1 }} +instance_group [ { kind: KIND_CPU } ] + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] + +parameters [ + { + key: "execute_delay_ms" + value: { string_value: "3000" } + } +] diff --git a/qa/L0_client_timeout/test.sh b/qa/L0_client_timeout/test.sh new file mode 100755 index 0000000000..f250dc9fa3 --- /dev/null +++ b/qa/L0_client_timeout/test.sh @@ -0,0 +1,272 @@ +#!/bin/bash +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 +TIMEOUT_VALUE=100000000 +SHORT_TIMEOUT_VALUE=1000 +RET=0 + +CLIENT_INFER_TIMEOUT_TEST=client_infer_timeout_test.py +CLIENT_NON_INFER_TIMEOUT_TEST=client_non_infer_timeout_test.py +CLIENT_TIMEOUT_TEST_CPP=../clients/client_timeout_test +TEST_RESULT_FILE='test_results.txt' + +rm -f *.log +rm -f *.log.* + +CLIENT_LOG=`pwd`/client.log +CLIENT_GRPC_TIMEOUTS_LOG=`pwd`/client.log.grpc +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR --model-control-mode=explicit --load-model=custom_identity_int32 --log-verbose 2" +source ../common/util.sh + +mkdir -p $DATADIR/custom_identity_int32/1 + +# Test all APIs apart from Infer. +export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=2 +run_server +if [ $? -eq 1 ]; then + echo -e "\n***\n*** Test Failed: GRPC non-infer APIs\n***" + RET=1 +fi +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Expect timeout for everything +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -p >> ${CLIENT_LOG}.c++.grpc_non_infer_apis 2>&1 +if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_non_infer_apis` != "18" ]; then + cat ${CLIENT_LOG}.c++.grpc_non_infer_apis + echo -e "\n***\n*** Test Failed. Expected 18 failed\n***" + RET=1 +fi +# Test all APIs with long timeout +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -i grpc -p >> ${CLIENT_LOG} 2>&1 +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Test Failed: GRPC non-infer APIs\n***" + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + +# Test infer APIs +unset TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC +SERVER_ARGS="--model-repository=$DATADIR --log-verbose 2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e + +# CASE 1: Provide too small a timeout and expect a failure. +# Note, the custom_identity_int32 is configured with a delay +# of 3 sec. +# Test request timeout in grpc synchronous inference +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc >> ${CLIENT_LOG}.c++.grpc_infer 2>&1 +if [ $? 
-eq 0 ]; then + RET=1 +fi +if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_infer` != "1" ]; then + cat ${CLIENT_LOG}.c++.grpc_infer + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# Test request timeout in grpc asynchronous inference +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -a >> ${CLIENT_LOG}.c++.grpc_async_infer 2>&1 +if [ $? -eq 0 ]; then + RET=1 +fi +if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.grpc_async_infer` != "1" ]; then + cat ${CLIENT_LOG}.c++.grpc_async_infer + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# Test stream timeout in grpc asynchronous streaming inference +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -i grpc -s >> ${CLIENT_LOG}.c++.grpc_async_stream_infer 2>&1 +if [ $? -eq 0 ]; then + RET=1 +fi +if [ `grep -c "Stream has been closed" ${CLIENT_LOG}.c++.grpc_async_stream_infer` != "1" ]; then + cat ${CLIENT_LOG}.c++.grpc_async_stream_infer + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# Test request timeout in http synchronous inference +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v >> ${CLIENT_LOG}.c++.http_infer 2>&1 +if [ $? -eq 0 ]; then + RET=1 +fi +if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.http_infer` == "0" ]; then + cat ${CLIENT_LOG}.c++.http_infer + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + + +# Test request timeout in http asynchronous inference +$CLIENT_TIMEOUT_TEST_CPP -t $SHORT_TIMEOUT_VALUE -v -a >> ${CLIENT_LOG}.c++.http_async_infer 2>&1 +if [ $? -eq 0 ]; then + RET=1 +fi +if [ `grep -c "Deadline Exceeded" ${CLIENT_LOG}.c++.http_async_infer` == "0" ]; then + cat ${CLIENT_LOG}.c++.http_async_infer + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 1 ]; then + # Return if CASE 1 failed + kill $SERVER_PID + wait $SERVER_PID + exit $RET +fi + + +# CASE 2: Provide sufficiently large timeout value +set +e + +echo "TEST: GRPC Synchronous" >> ${CLIENT_LOG} +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -i grpc >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed: GRPC Synchronous\n***" + RET=1 +fi + +echo "TEST: GRPC Asynchronous" >> ${CLIENT_LOG} +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -i grpc -a >> ${CLIENT_LOG}.c++.grpc_async_infer 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed: GRPC Asynchronous\n***" + RET=1 +fi + +echo "TEST: GRPC Streaming" >> ${CLIENT_LOG} +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -i grpc -s >> ${CLIENT_LOG}.c++.grpc_async_stream_infer 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed: GRPC Streaming\n***" + RET=1 +fi + +echo "TEST: HTTP Synchronous" >> ${CLIENT_LOG} +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v >> ${CLIENT_LOG}.c++.http_infer 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed: HTTP Synchronous\n***" + RET=1 +fi + +echo "TEST: HTTP Asynchronous" >> ${CLIENT_LOG} +$CLIENT_TIMEOUT_TEST_CPP -t $TIMEOUT_VALUE -v -a >> ${CLIENT_LOG}.c++.http_async_infer 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed: HTTP Asynchronous\n***" + RET=1 +fi + +echo "TEST: Python Library" >> ${CLIENT_LOG} + +# CASE 3: Python Library + +for i in test_grpc_infer \ + test_grpc_async_infer \ + test_grpc_stream_infer \ + test_http_infer \ + test_http_async_infer \ + ; do + python $CLIENT_INFER_TIMEOUT_TEST ClientInferTimeoutTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi +done + +set -e +kill $SERVER_PID +wait $SERVER_PID + +# Test all APIs other than infer +export TRITONSERVER_SERVER_DELAY_GRPC_RESPONSE_SEC=2 +SERVER_ARGS="${SERVER_ARGS} --model-control-mode=explicit --load-model=custom_identity_int32 --log-verbose 2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e + +python $CLIENT_NON_INFER_TIMEOUT_TEST >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat ${CLIENT_LOG} + echo -e "\n***\n*** Test FAILED\n***" +fi + +set +e +exit $RET diff --git a/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt b/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt new file mode 100644 index 0000000000..6a2a76bde5 --- /dev/null +++ b/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_identity_int32" +backend: "identity" +max_batch_size: 1024 +version_policy: { latest { num_versions: 1 }} +instance_group [ { kind: KIND_CPU } ] + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] \ No newline at end of file diff --git a/qa/L0_client_valgrind/test.sh b/qa/L0_client_valgrind/test.sh new file mode 100755 index 0000000000..0870aa883c --- /dev/null +++ b/qa/L0_client_valgrind/test.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. +export CUDA_VISIBLE_DEVICES=0 + +LEAKCHECK=/usr/bin/valgrind +LEAKCHECK_ARGS_BASE="--leak-check=full --show-leak-kinds=definite --max-threads=3000" +SERVER_TIMEOUT=3600 +rm -f *.log + +MEMORY_GROWTH_TEST=../clients/memory_leak_test + +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +source ../common/util.sh + +mkdir -p $DATADIR/custom_identity_int32/1 + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Run test for both HTTP and GRPC, re-using and not re-using client object. +# 1000 inferences in each case. +EXTRA_ARGS="-r 1000" +for PROTOCOL in http grpc; do + for REUSE in reuse no_reuse; do + LEAKCHECK_LOG="./valgrind.${PROTOCOL}.${REUSE}.c++.log" + CLIENT_LOG="./client.${PROTOCOL}.${REUSE}.c++.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + if [ "$REUSE" == "reuse" ]; then + EXTRA_CLIENT_ARGS="${EXTRA_ARGS} -i ${PROTOCOL} -R" + else + EXTRA_CLIENT_ARGS="${EXTRA_ARGS} -i ${PROTOCOL}" + fi + + $LEAKCHECK $LEAKCHECK_ARGS $MEMORY_GROWTH_TEST $EXTRA_CLIENT_ARGS >> ${CLIENT_LOG} 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG} + RET=1 + echo -e "\n***\n*** Test FAILED\n***" + else + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Memory leak detected\n***" + RET=1 + fi + fi + done +done + +# Stop Server +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_cmdline_trace/test.sh b/qa/L0_cmdline_trace/test.sh new file mode 100755 index 0000000000..d0f86dc2a9 --- /dev/null +++ b/qa/L0_cmdline_trace/test.sh @@ -0,0 +1,722 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ============================= Helpers ======================================= +function assert_server_startup_failed() { + if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n***Fail: Server start should have failed $SERVER\n***" + cat $SERVER_LOG + set -e + kill $SERVER_PID + wait $SERVER_PID + set +e + exit 1 + fi +} + +TRACE_SUMMARY=../common/trace_summary.py +CLIENT_SCRIPT=trace_client.py + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +ENSEMBLEDIR=$DATADIR/../qa_ensemble_model_repository/qa_model_repository/ +MODELBASE=onnx_int32_int32_int32 + +MODELSDIR=`pwd`/trace_models + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +rm -f *.log +rm -fr $MODELSDIR && mkdir -p $MODELSDIR + +# set up simple model using MODELBASE, this test needs gradually update as +# backends are ported to use backend API as backend API not yet support tracing. 
+rm -fr $MODELSDIR && mkdir -p $MODELSDIR && \ + cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \ + rm -r $MODELSDIR/simple/2 && rm -r $MODELSDIR/simple/3 && \ + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt) + +RET=0 + +# trace-level=OFF make sure no tracing +SERVER_ARGS="--trace-file=trace_off.log --trace-level=OFF --trace-rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_off.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_off.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_off.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +if [ -f ./trace_off.log ]; then + echo -e "\n***\n*** Test Failed, unexpected generation of trace_off.log\n***" + RET=1 +fi + +set -e + +# trace-rate == 1, trace-level=MIN make sure every request is traced +SERVER_ARGS="--trace-file=trace_min.log --trace-level=MIN --trace-rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_min.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_min.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_min.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_min.log > summary_min.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_min.log` != "20" ]; then + cat summary_min.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_min.log` != "20" ]; then + cat summary_min.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# trace-rate == 9, trace-level=MAX +SERVER_ARGS="--http-thread-count=1 --trace-file=trace_max.log \ + --trace-level=MAX --trace-rate=9 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_max.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_max.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_max.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_max.log > summary_max.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_max.log` != "2" ]; then + cat summary_max.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_max.log` != "2" ]; then + cat summary_max.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# trace-rate == 1, trace-level=TIMESTAMPS make sure every request is traced +SERVER_ARGS="--trace-file=trace_1.log --trace-level=TIMESTAMPS --trace-rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_1.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_1.log 2>&1 + if [ $? 
-ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_1.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_1.log > summary_1.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_1.log` != "20" ]; then + cat summary_1.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_1.log` != "20" ]; then + cat summary_1.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# trace-rate == 6, trace-level=TIMESTAMPS +SERVER_ARGS="--http-thread-count=1 --trace-file=trace_6.log \ + --trace-level=TIMESTAMPS --trace-rate=6 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_6.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_6.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_6.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_6.log > summary_6.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_6.log` != "3" ]; then + cat summary_6.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_6.log` != "3" ]; then + cat summary_6.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# trace-rate == 6, trace-level=TIMESTAMPS, trace-log-frequency == 2 +SERVER_ARGS="--http-thread-count=1 --trace-file=trace_frequency.log \ + --trace-level=TIMESTAMPS --trace-rate=6 \ + --trace-log-frequency=2 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_frequency.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_frequency.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_frequency.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +# Two trace files +$TRACE_SUMMARY -t trace_frequency.log.0 > summary_frequency.log.0 +if [ `grep -c "COMPUTE_INPUT_END" summary_frequency.log.0` != "2" ]; then + cat summary_frequency.log.0 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_frequency.log.0` != "2" ]; then + cat summary_frequency.log.0 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$TRACE_SUMMARY -t trace_frequency.log.1 > summary_frequency.log.1 +if [ `grep -c "COMPUTE_INPUT_END" summary_frequency.log.1` != "1" ]; then + cat summary_frequency.log.1 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_frequency.log.1` != "1" ]; then + cat summary_frequency.log.1 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# trace-rate == 9, trace-level=TIMESTAMPS +SERVER_ARGS="--http-thread-count=1 --trace-file=trace_9.log \ + --trace-level=TIMESTAMPS --trace-rate=9 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_9.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +for p in {1..10}; do + python3 $CLIENT_SCRIPT -i grpc -u localhost:8001 >> client_9.log 2>&1 + if [ $? 
-ne 0 ]; then + RET=1 + fi + + python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_9.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_9.log > summary_9.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_9.log` != "2" ]; then + cat summary_9.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_9.log` != "2" ]; then + cat summary_9.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# Demonstrate trace for ensemble +# set up "addsub" nested ensemble +rm -fr $MODELSDIR && mkdir -p $MODELSDIR && \ + cp -r $DATADIR/$MODELBASE $MODELSDIR/$MODELBASE && \ + rm -r $MODELSDIR/$MODELBASE/2 && rm -r $MODELSDIR/$MODELBASE/3 + +# nested ensemble +mkdir -p $MODELSDIR/fan_$MODELBASE/1 && \ + cp $ENSEMBLEDIR/fan_$MODELBASE/config.pbtxt $MODELSDIR/fan_$MODELBASE/. && \ + (cd $MODELSDIR/fan_$MODELBASE && \ + sed -i "s/label_filename:.*//" config.pbtxt) + +mkdir -p $MODELSDIR/simple/1 && \ + cp $ENSEMBLEDIR/fan_$MODELBASE/config.pbtxt $MODELSDIR/simple/. && \ + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt && \ + sed -i "s/$MODELBASE/fan_$MODELBASE/" config.pbtxt && \ + sed -i "s/label_filename:.*//" config.pbtxt) + +cp -r $ENSEMBLEDIR/nop_TYPE_INT32_-1 $MODELSDIR/. && \ + mkdir -p $MODELSDIR/nop_TYPE_INT32_-1/1 + +# trace-rate == 1, trace-level=TIMESTAMPS +SERVER_ARGS="--http-thread-count=1 --trace-file=trace_ensemble.log \ + --trace-level=TIMESTAMPS --trace-rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_ensemble.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_ensemble.log 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_ensemble.log > summary_ensemble.log + +# Check if the traces are captured with proper hierarchy +if [ `grep -c "COMPUTE_INPUT_END" summary_ensemble.log` != "7" ]; then + echo -e "Ensemble trace log expects 7 compute" + RET=1 +fi + +for trace_str in \ + "{\"id\":1,\"model_name\":\"simple\",\"model_version\":1,\"request_id\":\"1\"}" \ + "{\"id\":2,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" \ + "{\"id\":3,\"model_name\":\"fan_${MODELBASE}\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" \ + "{\"id\":4,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":5,\"model_name\":\"${MODELBASE}\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":6,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":7,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":8,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" \ + "{\"id\":9,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" ; do + if [ `grep -c ${trace_str} trace_ensemble.log` != "1" ]; then + echo -e "Ensemble trace log expects trace: ${trace_str}" + RET=1 + fi +done + +if [ `grep -c ^simple summary_ensemble.log` != "1" ]; then + cat summary_ensemble.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + + +# trace-rate == 1, trace-level=TIMESTAMPS, trace-level=TENSORS +SERVER_ARGS="--http-thread-count=1 --trace-file=trace_ensemble_tensor.log \ + --trace-level=TIMESTAMPS --trace-level=TENSORS --trace-rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_ensemble_tensor.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python3 $CLIENT_SCRIPT -i http -u localhost:8000 >> client_ensemble_tensor.log 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t trace_ensemble_tensor.log > summary_ensemble_tensor.log + +# Check if the traces are captured with proper hierarchy +if [ `grep -c "COMPUTE_INPUT_END" summary_ensemble_tensor.log` != "7" ]; then + echo -e "Ensemble trace tensors log expects 7 compute" + RET=1 +fi +for trace_str in \ + "{\"id\":1,\"model_name\":\"simple\",\"model_version\":1,\"request_id\":\"1\"}" \ + "{\"id\":2,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" \ + "{\"id\":3,\"model_name\":\"fan_${MODELBASE}\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" \ + "{\"id\":4,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":5,\"model_name\":\"${MODELBASE}\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":6,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":7,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":3}" \ + "{\"id\":8,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" \ + "{\"id\":9,\"model_name\":\"nop_TYPE_INT32_-1\",\"model_version\":1,\"request_id\":\"1\",\"parent_id\":1}" ; do + if [ `grep -c ${trace_str} trace_ensemble_tensor.log` != "1" ]; then + echo -e "Ensemble trace tensors log expects trace: ${trace_str}" + RET=1 + fi +done + +if [ `grep -c ^simple summary_ensemble_tensor.log` != "1" ]; then + cat summary_ensemble_tensor.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -o TENSOR_QUEUE_INPUT trace_ensemble_tensor.log | wc -l` != "18" ]; then + echo -e "Ensemble trace tensors log expects 18 TENSOR_QUEUE_INPUTs" + RET=1 +fi + +if [ `grep -o TENSOR_BACKEND_OUTPUT trace_ensemble_tensor.log | wc -l` != "14" ]; then + echo -e "Ensemble trace tensors log expects 14 TENSOR_BACKEND_OUTPUTs" + RET=1 +fi + +for trace_str in \ + "{\"id\":1,\"activity\":\"TENSOR_QUEUE_INPUT\",\"tensor\":{\"name\":\"INPUT0\",\"data\":\"0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15\",\"shape\":\"1,16\",\"dtype\":\"INT32\"}}" \ + "{\"id\":1,\"activity\":\"TENSOR_QUEUE_INPUT\",\"tensor\":{\"name\":\"INPUT1\",\"data\":\"1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1\",\"shape\":\"1,16\",\"dtype\":\"INT32\"}}" \ + "{\"id\":1,\"activity\":\"TENSOR_BACKEND_OUTPUT\",\"tensor\":{\"name\":\"OUTPUT0\",\"data\":\"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16\",\"shape\":\"1,16\",\"dtype\":\"INT32\"}}" \ + "{\"id\":1,\"activity\":\"TENSOR_BACKEND_OUTPUT\",\"tensor\":{\"name\":\"OUTPUT1\",\"data\":\"-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14\",\"shape\":\"1,16\",\"dtype\":\"INT32\"}}" ; do + if [ `grep -c ${trace_str} trace_ensemble_tensor.log` != "1" ]; then + echo -e "Ensemble trace tensors log expects trace: ${trace_str}" + RET=1 + fi +done + +set -e + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + + +# check deprecation warnings +SERVER_ARGS=" --trace-file=/tmp/trace.json --trace-rate=100 --trace-level=TIMESTAMPS \ + --trace-log-frequency=50 --trace-count=100 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_trace_config_flag.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +if [ `grep -c "Warning: '--trace-file' has been deprecated" $SERVER_LOG` != "1" ]; 
then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c "Warning: '--trace-rate' has been deprecated" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c "Warning: '--trace-level' has been deprecated" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c "Warning: '--trace-log-frequency' has been deprecated" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c "Warning: '--trace-count' has been deprecated" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +################################################################################ +# The following set of tests checks that tritonserver gracefully handles # +# bad OpenTelemetry BatchSpanProcessor parameters, provided through # +# environment variables, or tritonserver's options. # +################################################################################ +export OTEL_BSP_MAX_QUEUE_SIZE="bad_value" + +SERVER_ARGS="--trace-config mode=opentelemetry --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_trace_config_flag.log" +run_server +assert_server_startup_failed + +if [ `grep -c "Bad option: \"OTEL_BSP_MAX_QUEUE_SIZE\"" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +unset OTEL_BSP_MAX_QUEUE_SIZE + +export OTEL_BSP_SCHEDULE_DELAY="bad_value" +run_server +assert_server_startup_failed + +if [ `grep -c "Bad option: \"OTEL_BSP_SCHEDULE_DELAY\"" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +unset OTEL_BSP_SCHEDULE_DELAY + +export OTEL_BSP_MAX_EXPORT_BATCH_SIZE="bad_value" +run_server +assert_server_startup_failed + +if [ `grep -c "Bad option: \"OTEL_BSP_MAX_EXPORT_BATCH_SIZE\"" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +unset OTEL_BSP_MAX_EXPORT_BATCH_SIZE + +SERVER_ARGS="--model-repository=$MODELSDIR --trace-config mode=opentelemetry \ + --trace-config opentelemetry,bsp_max_queue_size=bad_value" +SERVER_LOG="./inference_server_trace_config_flag.log" +run_server +assert_server_startup_failed + +if [ `grep -c "Bad option: \"--trace-config opentelemetry,bsp_max_queue_size\"" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +SERVER_ARGS="--model-repository=$MODELSDIR --trace-config mode=opentelemetry \ + --trace-config opentelemetry,bsp_schedule_delay=bad_value" +SERVER_LOG="./inference_server_trace_config_flag.log" +run_server +assert_server_startup_failed + +if [ `grep -c "Bad option: \"--trace-config opentelemetry,bsp_schedule_delay\"" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +SERVER_ARGS="--model-repository=$MODELSDIR --trace-config mode=opentelemetry \ + --trace-config opentelemetry,bsp_max_export_batch_size=bad_value" +SERVER_LOG="./inference_server_trace_config_flag.log" +run_server +assert_server_startup_failed + +if [ `grep -c "Bad option: \"--trace-config opentelemetry,bsp_max_export_batch_size\"" $SERVER_LOG` != "1" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET 
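The expected counts in the trace-rate checks above follow from Triton's sampling rule: with --trace-rate=N, one request out of every N is traced. Each pass of this test sends 10 gRPC and 10 HTTP requests (20 in total), which is where the 20/3/2 expectations and the 2-then-1 split under --trace-log-frequency=2 come from. A minimal sketch of that arithmetic, assuming the sampling rule above (the helper name is illustrative and not part of the test suite):

# Expected number of traced requests when every Nth request is sampled.
def expected_traced(total_requests, trace_rate):
    return total_requests // trace_rate

# Matches the grep checks in L0_cmdline_trace/test.sh above.
assert expected_traced(20, 1) == 20  # --trace-rate=1 -> all 20 requests traced
assert expected_traced(20, 6) == 3   # --trace-rate=6 -> 3 traced requests
assert expected_traced(20, 9) == 2   # --trace-rate=9 -> 2 traced requests
# With --trace-log-frequency=2, the 3 traces collected at rate 6 are flushed
# as trace_frequency.log.0 (2 traces) and trace_frequency.log.1 (1 trace).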
diff --git a/qa/L0_cmdline_trace/trace_client.py b/qa/L0_cmdline_trace/trace_client.py new file mode 100755 index 0000000000..4d59579d7c --- /dev/null +++ b/qa/L0_cmdline_trace/trace_client.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8001", + help="Inference server URL. Default is localhost:8001.", + ) + parser.add_argument("-i", "--protocol", type=str, required=True) + FLAGS = parser.parse_args() + + if FLAGS.protocol == "grpc": + client_type = grpcclient + else: + client_type = httpclient + + try: + triton_client = client_type.InferenceServerClient(url=FLAGS.url) + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit() + + model_name = "simple" + + # Infer + inputs = [] + outputs = [] + inputs.append(client_type.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(client_type.InferInput("INPUT1", [1, 16], "INT32")) + + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input0_data = np.expand_dims(input0_data, axis=0) + input1_data = np.ones(shape=(1, 16), dtype=np.int32) + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs.append(client_type.InferRequestedOutput("OUTPUT0")) + outputs.append(client_type.InferRequestedOutput("OUTPUT1")) + + triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs, request_id="1" + ) diff --git a/qa/L0_compute_capability/test.sh b/qa/L0_compute_capability/test.sh new file mode 100755 index 0000000000..d85acb1b6e --- /dev/null +++ b/qa/L0_compute_capability/test.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +source ../common/util.sh + +rm -f *.log + +RET=0 + +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch plan"} + +for BACKEND in $BACKENDS; do + # Need just one model for the backend... + rm -fr models && mkdir models + cp -r ${DATADIR}/qa_model_repository/${BACKEND}_float32_float32_float32 \ + models/. + + if [ "$BACKEND" != "plan" ]; then + for MC in `ls models/*/config.pbtxt`; do + echo "instance_group [ { kind: KIND_GPU }]" >> $MC + done + fi + + # Run with a high minimum capability so that no GPUs are + # recognized. This should cause the server to fail to start since + # we explicitly asked for a GPU in the instance_group. + SERVER_ARGS="--min-supported-compute-capability=100.0 --model-repository=`pwd`/models" + SERVER_LOG="./inference_server_${BACKEND}_cc100.log" + run_server + if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected success with min compute 100.0 for ${BACKEND}\n***" + RET=1 + + kill $SERVER_PID + wait $SERVER_PID + fi + + # Run with a low minimum capability and make sure GPUs are + # recognized. 
+ SERVER_ARGS="--min-supported-compute-capability=1.0 --model-repository=`pwd`/models" + SERVER_LOG="./inference_server_${BACKEND}_cc1.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Unexpected failure with min compute 1.0 for ${BACKEND}\n***" + RET=1 + else + kill $SERVER_PID + wait $SERVER_PID + fi +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_config_json/ensemble_config.pbtxt b/qa/L0_config_json/ensemble_config.pbtxt new file mode 100644 index 0000000000..29de01a3aa --- /dev/null +++ b/qa/L0_config_json/ensemble_config.pbtxt @@ -0,0 +1,105 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "simple_ensemble" +platform: "ensemble" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "savedmodel_nobatch_float32_float32_float32" + model_version: 1 + input_map [ + { + key : "INPUT0" + value : "INPUT0" + }, + { + key : "INPUT1" + value : "INPUT1" + } + ] + output_map [ + { + key : "OUTPUT0" + value : "out0" + }, + { + key : "OUTPUT1" + value : "out1" + } + ] + }, + { + model_name: "savedmodel_nobatch_float32_float32_float32" + model_version: -1 + input_map [ + { + key : "INPUT0" + value : "out0" + }, + { + key : "INPUT1" + value : "out1" + } + ] + output_map [ + { + key : "OUTPUT0" + value : "OUTPUT0" + }, + { + key : "OUTPUT1" + value : "OUTPUT1" + } + ] + } + ] +} diff --git a/qa/L0_config_json/max_priority_level.pbtxt b/qa/L0_config_json/max_priority_level.pbtxt new file mode 100644 index 0000000000..f71f08d236 --- /dev/null +++ b/qa/L0_config_json/max_priority_level.pbtxt @@ -0,0 +1,62 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "max_priority_level" +backend: "identity" +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +dynamic_batching: +{ + # Max uint64 + priority_levels: 18446744073709551615 + # Max uint32 + default_priority_level: 4294967295 + # Max uint32 + 1 + priority_queue_policy: [ + {key: 4294967296 + value: { + timeout_action: REJECT + default_timeout_microseconds: 18446744073709551615 + allow_timeout_override: true + max_queue_size: 10 + } + } +] +} \ No newline at end of file diff --git a/qa/L0_config_json/test.sh b/qa/L0_config_json/test.sh new file mode 100755 index 0000000000..b1016b806b --- /dev/null +++ b/qa/L0_config_json/test.sh @@ -0,0 +1,428 @@ +#!/bin/bash +# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR="/data/inferenceserver/${REPO_VERSION}" +CLIENT_LOG="./client.log" +SERVER_LOG="./inference_server.log" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +source ../common/util.sh + +RET=0 +rm -fr *.log + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. + +# Test input and output dims are shown as numbers +TRIAL=ios + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"dims\":\[16\]" $TRIAL.out | wc -l` +if [ $matches -ne 4 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 4 dims, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Test input and output reshape are shown as numbers +TRIAL=reshape + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. +(cd models/savedmodel_nobatch_float32_float32_float32 && \ + sed -i "s/data_type:.*TYPE_FP32/data_type: TYPE_FP32\nreshape: { shape: [ 16 ]}/g" config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"reshape\":{\"shape\":\[16\]}" $TRIAL.out | wc -l` +if [ $matches -ne 4 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 4 reshape:shape, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Test version_policy::specific +TRIAL=specific + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. 
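+# The sed below rewrites the version_policy line of the copied config so that
+# it reads:
+#
+#   version_policy: { specific: { versions: [1] }}
+#
+# The JSON config endpoint is then expected to render this exactly once as
+# "version_policy":{"specific":{"versions":[1]}}.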
+(cd models/savedmodel_nobatch_float32_float32_float32 && \ + sed -i "s/^version_policy:.*/version_policy: { specific: { versions: [1] }}/" config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"version_policy\":{\"specific\":{\"versions\":\[1\]}}" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 version_policy:specific:versions, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Test dynamic_batching::max_queue_delay_microseconds, +# dynamic_batching::default_queue_policy::default_timeout_microseconds, +# dynamic_batching::priority_queue_policy::value::default_timeout_microseconds +TRIAL=dbatch + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. +(cd models/savedmodel_nobatch_float32_float32_float32 && \ + echo "dynamic_batching: { max_queue_delay_microseconds: 42 \ + default_queue_policy: { default_timeout_microseconds: 123 } \ + priority_queue_policy: { key: 1 value: { default_timeout_microseconds: 123 }} \ + priority_queue_policy: { key: 2 value: { default_timeout_microseconds: 123 }}}" >> config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"dynamic_batching\":{" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 dynamic_batching, got $matches\n***" + RET=1 +fi + +matches=`grep -o "\"max_queue_delay_microseconds\":42" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 dynamic_batching:max_queue_delay_microseconds, got $matches\n***" + RET=1 +fi + +matches=`grep -o "\"default_timeout_microseconds\":123" $TRIAL.out | wc -l` +if [ $matches -ne 3 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 3 dynamic_batching:*_queue_policy:default_timeout_microseconds, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Test sequence_batching::oldest::max_queue_delay_microseconds, +# sequence_batching::max_sequence_idle_microseconds +TRIAL=sbatch + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. 
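+# The echo below appends a minimal sequence batcher to the copied config; the
+# JSON config endpoint is then expected to report
+# "max_sequence_idle_microseconds":42 and, under "oldest",
+# "max_queue_delay_microseconds":987.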
+(cd models/savedmodel_nobatch_float32_float32_float32 && \ + echo "sequence_batching: { max_sequence_idle_microseconds: 42 \ + oldest: { max_queue_delay_microseconds: 987 }}" >> config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"sequence_batching\":{" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 sequence_batching, got $matches\n***" + RET=1 +fi + +matches=`grep -o "\"max_sequence_idle_microseconds\":42" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 sequence_batching:max_sequence_idle_microseconds, got $matches\n***" + RET=1 +fi + +matches=`grep -o "\"oldest\":{" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 sequence_batching:oldest, got $matches\n***" + RET=1 +fi + +matches=`grep -o "\"max_queue_delay_microseconds\":987" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 sequence_batching:oldest:max_queue_delay_microseconds, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Test ensemble_scheduling::step::model_version +TRIAL=ensemble + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. +mkdir -p models/simple_ensemble/1 && cp ensemble_config.pbtxt models/simple_ensemble/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/simple_ensemble/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"model_version\":1" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 ensemble_scheduling:step:model_version == 1, got $matches\n***" + RET=1 +fi + +matches=`grep -o "\"model_version\":-1" $TRIAL.out | wc -l` +if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 ensemble_scheduling:step:model_version == -1, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +rm -fr models/simple_ensemble + +# Test model_warmup::inputs::value::dims +TRIAL=warmup + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. 
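+# The echo chain below builds two warmup samples directly in config.pbtxt;
+# assembled, the appended block looks roughly like:
+#
+#   model_warmup [{
+#     name : "warmup 1"
+#     batch_size: 1
+#     inputs [{ key: "INPUT0" value: { data_type: TYPE_FP32 dims: 16 zero_data: true } },
+#             { key: "INPUT1" value: { data_type: TYPE_FP32 dims: 16 random_data: true } }]
+#   }, {
+#     name : "warmup 2"  # repeats the same two inputs
+#   }]
+#
+# Together with the four [16] dims already present on the model's inputs and
+# outputs, the two warmup samples account for the 8 "dims":[16] matches
+# expected below.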
+(cd models/savedmodel_nobatch_float32_float32_float32 && \ + echo "model_warmup [{" >> config.pbtxt && \ + echo " name : \"warmup 1\"" >> config.pbtxt && \ + echo " batch_size: 1" >> config.pbtxt && \ + echo " inputs [{" >> config.pbtxt && \ + echo " key: \"INPUT0\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }, {" >> config.pbtxt && \ + echo " key: \"INPUT1\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " random_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }]" >> config.pbtxt && \ + echo " }, {" >> config.pbtxt && \ + echo " name : \"warmup 2\"" >> config.pbtxt && \ + echo " batch_size: 1" >> config.pbtxt && \ + echo " inputs [{" >> config.pbtxt && \ + echo " key: \"INPUT0\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }, {" >> config.pbtxt && \ + echo " key: \"INPUT1\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " random_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }]" >> config.pbtxt && \ + echo " }]" >> config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +matches=`grep -o "\"dims\":\[16\]" $TRIAL.out | wc -l` +if [ $matches -ne 8 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 8 model_warmup:inputs:dims, got $matches\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Test max_priority_level +TRIAL=max_priority_level + +rm -fr models && mkdir models +mkdir -p models/max_priority_level/1 && cp max_priority_level.pbtxt models/max_priority_level/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/max_priority_level/config` +set -e +if [ "$code" != "200" ]; then + cat $TRIAL.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +declare -A expected_values + +MAX_UINT64=18446744073709551615 +MAX_UINT32=4294967295 +MAX_UINT32_PLUS_1=4294967296 + +expected_values["priority_levels"]=$MAX_UINT64 +expected_values["default_priority_level"]=$MAX_UINT32 +expected_values[$MAX_UINT32_PLUS_1]=\{\"timeout_action\":\"REJECT\",\"default_timeout_microseconds\":18446744073709551615,\"allow_timeout_override\":true,\"max_queue_size\":10\} +expected_values["default_timeout_microseconds"]=$MAX_UINT64 + +for key in "${!expected_values[@]}"; do + value=${expected_values[$key]} + matches=`grep -o "\"$key\":$value" $TRIAL.out | wc -l` + if [ $matches -ne 1 ]; then + cat $TRIAL.out + echo -e "\n***\n*** Expected 1 $key == $value, got $matches\n***" + RET=1 + fi +done + +kill $SERVER_PID +wait $SERVER_PID + +if [ 
$RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_cuda_graph/test.sh b/qa/L0_cuda_graph/test.sh new file mode 100755 index 0000000000..e1bfe2057f --- /dev/null +++ b/qa/L0_cuda_graph/test.sh @@ -0,0 +1,344 @@ +#!/bin/bash +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +TRT_CUDA_GRAPH_TEST=trt_cuda_graph_test.py +TEST_RESULT_FILE='test_results.txt' +DATADIR="./models" + +rm -rf ${DATADIR} +mkdir -p ${DATADIR} + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--log-verbose=1 --model-repository=$DATADIR --strict-model-config=true" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f *.log* + +RET=0 + +# TrtCudaGraphTest.test_fixed_shape +rm -rf ${DATADIR} && mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/plan_float32_float32_float32 ${DATADIR}/ +# Make sure only one version is present +rm -rf ${DATADIR}/plan_float32_float32_float32/3 + +CLIENT_LOG="./fixed_shape.client.log" +SERVER_LOG="./fixed_shape.inference_server.log" +echo "optimization { cuda { graphs: true } }" >> ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_CUDA_GRAPH_TEST TrtCudaGraphTest.test_fixed_shape>>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +if [ `grep -c "Context with profile default \[0\] is being executed for " $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Failed. Expected only one execution without CUDA graph\n***" + RET=1 +fi + +if [ `grep -c "captured CUDA graph for" $SERVER_LOG` != "6" ]; then + echo -e "\n***\n*** Failed. Expected 6 CUDA graphs are captured\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtCudaGraphTest.test_dynamic_shape +# plan_float32_float32_float32 models with dynamic shapes has 6 profiles +# min, opt, max, idx +# [1, 1], [1, 16], [8, 33], 0 (*) +# [1, 1], [2, 16], [7, 32], 1 +# [1, 1], [3, 16], [6, 32], 2 +# [1, 1], [4, 16], [5, 32], 3 +# [5, 1], [6, 16], [8, 32], 4 (*) +# [6, 1], [6, 16], [8, 32], 5 (*) +# [1, 1], [1, 16], [8, 32], 6 +rm -rf ${DATADIR} && mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32 ${DATADIR}/ + +SERVER_ARGS="--log-verbose=1 --model-repository=$DATADIR --strict-model-config=true" +CLIENT_LOG="./dynamic_shape.client.log" +SERVER_LOG="./dynamic_shape.inference_server.log" +sed -i "s/profile:.*/profile: [\"0\"]/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt +echo "optimization { cuda { graphs: true } }" >> ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_CUDA_GRAPH_TEST TrtCudaGraphTest.test_dynamic_shape>>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +if [ `grep -c "Context with profile 0 \[0\] is being executed for " $SERVER_LOG` != "2" ]; then + echo -e "\n***\n*** Failed. Expected 2 execution without CUDA graph\n***" + RET=1 +fi + +if [ `grep -c "captured CUDA graph for" $SERVER_LOG` != "6" ]; then + echo -e "\n***\n*** Failed. Expected 6 CUDA graphs are captured\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtCudaGraphTest.test_range_fixed_shape +rm -rf ${DATADIR} && mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/plan_float32_float32_float32 ${DATADIR}/ +# Make sure only one version is present +rm -rf ${DATADIR}/plan_float32_float32_float32/3 + +SERVER_ARGS="--log-verbose=1 --model-repository=$DATADIR" +CLIENT_LOG="./range_fixed_shape.client.log" +SERVER_LOG="./range_fixed_shape.inference_server.log" +echo "optimization { \ + cuda { \ + graphs: true \ + graph_spec [ { \ + batch_size: 4 \ + graph_lower_bound { \ + batch_size: 2 \ + } \ +} ] } }" >> ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_CUDA_GRAPH_TEST TrtCudaGraphTest.test_range_fixed_shape>>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +if [ `grep -c "Context with profile default \[0\] is being executed for " $SERVER_LOG` != "3" ]; then + echo -e "\n***\n*** Failed. Expected only 3 execution without CUDA graph\n***" + RET=1 +fi + +if [ `grep -c "captured CUDA graph for" $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 CUDA graphs are captured\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtCudaGraphTest.test_range_dynamic_shape +# plan_float32_float32_float32 models with dynamic shapes has 6 profiles +# min, opt, max, idx +# [1, 1], [1, 16], [8, 33], 0 (*) +# [1, 1], [2, 16], [7, 32], 1 +# [1, 1], [3, 16], [6, 32], 2 +# [1, 1], [4, 16], [5, 32], 3 +# [5, 1], [6, 16], [8, 32], 4 (*) +# [6, 1], [6, 16], [8, 32], 5 (*) +# [1, 1], [1, 16], [8, 32], 6 +rm -rf ${DATADIR} && mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32 ${DATADIR}/ + +CLIENT_LOG="./range_dynamic_shape.client.log" +SERVER_LOG="./range_dynamic_shape.inference_server.log" +sed -i "s/profile:.*/profile: [\"0\"]/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt +echo "optimization { \ + cuda { \ + graphs: true \ + graph_spec [ { \ + batch_size: 4 \ + input { key: \"INPUT0\" value: {dim : [16]} } \ + input { key: \"INPUT1\" value: {dim : [16]} } \ + graph_lower_bound { \ + batch_size: 2 \ + input { key: \"INPUT0\" value: {dim : [8]} } \ + input { key: \"INPUT1\" value: {dim : [8]} } \ + } \ +} ] } }" >> ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_CUDA_GRAPH_TEST TrtCudaGraphTest.test_range_dynamic_shape>>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +if [ `grep -c "Context with profile 0 \[0\] is being executed for " $SERVER_LOG` != "4" ]; then + echo -e "\n***\n*** Failed. Expected 4 execution without CUDA graph\n***" + RET=1 +fi + +if [ `grep -c "captured CUDA graph for" $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 CUDA graphs are captured\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtCudaGraphTest.test_nobatch_fixed_shape +rm -rf ${DATADIR} && mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/plan_nobatch_float32_float32_float32 ${DATADIR}/ +# Make sure only one version is present +rm -rf ${DATADIR}/plan_nobatch_float32_float32_float32/2 ${DATADIR}/plan_nobatch_float32_float32_float32/3 + +CLIENT_LOG="./nobatch_fixed_shape.client.log" +SERVER_LOG="./nobatch_fixed_shape.inference_server.log" +echo "optimization { cuda { graphs: true } }" >> ${DATADIR}/plan_nobatch_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_CUDA_GRAPH_TEST TrtCudaGraphTest.test_nobatch_fixed_shape plan_nobatch>>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +if [ `grep -c "Context with profile default \[0\] is launching CUDA graph " $SERVER_LOG` != "0" ]; then + echo -e "\n***\n*** Failed. Expected 0 execution with CUDA graph\n***" + RET=1 +fi + +if [ `grep -c "captured CUDA graph for" $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 CUDA graph to be captured\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_cuda_graph/trt_cuda_graph_test.py b/qa/L0_cuda_graph/trt_cuda_graph_test.py new file mode 100755 index 0000000000..c77ee5e5f4 --- /dev/null +++ b/qa/L0_cuda_graph/trt_cuda_graph_test.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
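+
+# These cases are driven by qa/L0_cuda_graph/test.sh, which selects a single
+# unittest by name and may pass a model base name as a trailing argument, e.g.:
+#
+#   python trt_cuda_graph_test.py TrtCudaGraphTest.test_nobatch_fixed_shape plan_nobatch
+#
+# A trailing argument, when present, overrides TrtCudaGraphTest.MODELNAME
+# (default "plan") before unittest.main() parses the remaining argv.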
+ +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu +from tritonclient.utils import * + + +class TrtCudaGraphTest(tu.TestResultCollector): + MODELNAME = "plan" + + def setUp(self): + self.dtype_ = np.float32 + self.dtype_str_ = "FP32" + self.model_name_ = self.MODELNAME + + def _check_infer(self, tensor_shape, batch_size=1): + try: + if batch_size: + full_shape = (batch_size,) + tensor_shape + else: + full_shape = tensor_shape + iu.infer_exact( + self, + self.model_name_, + full_shape, + batch_size, + self.dtype_, + self.dtype_, + self.dtype_, + model_version=1, + use_http_json_tensors=False, + use_grpc=False, + use_streaming=False, + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def _erroneous_infer(self, tensor_shape, batch_size): + import tritonhttpclient + + item_size = batch_size + for dim in tensor_shape: + item_size *= dim + full_shape = (batch_size,) + tensor_shape + input_np = np.arange(item_size, dtype=self.dtype_).reshape(full_shape) + expected_output0_np = input_np + input_np + expected_output1_np = input_np - input_np + + inputs = [] + inputs.append( + tritonhttpclient.InferInput("INPUT0", full_shape, self.dtype_str_) + ) + inputs[-1].set_data_from_numpy(input_np) + inputs.append( + tritonhttpclient.InferInput("INPUT1", full_shape, self.dtype_str_) + ) + inputs[-1].set_data_from_numpy(input_np) + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + model_name = tu.get_model_name( + self.model_name_, self.dtype_, self.dtype_, self.dtype_ + ) + results = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ).infer(model_name=model_name, inputs=inputs, outputs=outputs) + # Validate the results by comparing with precomputed values. 
+ output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") + self.assertFalse( + np.array_equal(output0_np, expected_output0_np), + "expects OUTPUT0 is not correct", + ) + self.assertFalse( + np.array_equal(output1_np, expected_output1_np), + "expects OUTPUT1 is not correct", + ) + + def test_fixed_shape(self): + tensor_shape = (16,) + self._check_infer(tensor_shape) + # Inference that should not have CUDA graph captured + self._check_infer(tensor_shape, 5) + + def test_dynamic_shape(self): + tensor_shape = (16,) + self._check_infer(tensor_shape) + # Inference that should not have CUDA graph captured + self._check_infer((20,)) + self._check_infer(tensor_shape, 5) + + def test_range_fixed_shape(self): + tensor_shape = (16,) + # Inferences that are in range of captured CUDA graph, + # model should tolerate difference in batch size + self._check_infer(tensor_shape, 4) + self._check_infer(tensor_shape, 2) + # Inferences that shouldn't use CUDA graph + self._check_infer(tensor_shape, 1) + self._check_infer(tensor_shape, 8) + + def test_range_dynamic_shape(self): + # Inferences that are in range of captured CUDA graph, + # model should tolerate difference in batch size + self._check_infer((16,), 4) + self._check_infer((16,), 2) + # Inference should return dummy result + # because the input shape is different + self._erroneous_infer((10,), 3) + + # Inferences that shouldn't use CUDA graph + self._check_infer((7,), 3) + self._check_infer((16,), 1) + self._check_infer((16,), 8) + self._check_infer((30,), 4) + + def test_nobatch_fixed_shape(self): + self._check_infer((16,), 0) + + +if __name__ == "__main__": + if len(sys.argv) > 2: + TrtCudaGraphTest.MODELNAME = sys.argv.pop() + + unittest.main() diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py new file mode 100755 index 0000000000..51137e8934 --- /dev/null +++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py @@ -0,0 +1,568 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import time +import unittest +from functools import partial + +import infer_util as iu +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonclient.utils.cuda_shared_memory as cshm +from tritonclient.utils import * + + +class CudaSharedMemoryTestBase(tu.TestResultCollector): + DEFAULT_SHM_BYTE_SIZE = 64 + + def setUp(self): + self._setup_client() + + def _setup_client(self): + self.protocol = os.environ.get("CLIENT_TYPE", "http") + if self.protocol == "http": + self.url = "localhost:8000" + self.triton_client = httpclient.InferenceServerClient( + self.url, verbose=True + ) + else: + self.url = "localhost:8001" + self.triton_client = grpcclient.InferenceServerClient( + self.url, verbose=True + ) + + def _configure_server( + self, + create_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_byte_size=DEFAULT_SHM_BYTE_SIZE, + device_id=0, + ): + """Creates and registers cuda shared memory regions for testing. + + Parameters + ---------- + create_byte_size: int + Size of each cuda shared memory region to create. + NOTE: This should be sufficiently large to hold the inputs/outputs + stored in shared memory. + + register_byte_size: int + Size of each cuda shared memory region to register with server. + NOTE: The register_byte_size should be less than or equal + to the create_byte_size. Otherwise an exception will be raised for + an invalid set of registration args. + + device_id: int + The GPU device ID of the cuda shared memory region to be created. 
+ + """ + + shm_ip0_handle = cshm.create_shared_memory_region( + "input0_data", create_byte_size, device_id + ) + shm_ip1_handle = cshm.create_shared_memory_region( + "input1_data", create_byte_size, device_id + ) + shm_op0_handle = cshm.create_shared_memory_region( + "output0_data", create_byte_size, device_id + ) + shm_op1_handle = cshm.create_shared_memory_region( + "output1_data", create_byte_size, device_id + ) + + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input1_data = np.ones(shape=16, dtype=np.int32) + cshm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + cshm.set_shared_memory_region(shm_ip1_handle, [input1_data]) + + self.triton_client.register_cuda_shared_memory( + "input0_data", + cshm.get_raw_handle(shm_ip0_handle), + device_id, + register_byte_size, + ) + self.triton_client.register_cuda_shared_memory( + "input1_data", + cshm.get_raw_handle(shm_ip1_handle), + device_id, + register_byte_size, + ) + self.triton_client.register_cuda_shared_memory( + "output0_data", + cshm.get_raw_handle(shm_op0_handle), + device_id, + register_byte_size, + ) + self.triton_client.register_cuda_shared_memory( + "output1_data", + cshm.get_raw_handle(shm_op1_handle), + device_id, + register_byte_size, + ) + return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] + + def _cleanup_server(self, shm_handles): + for shm_handle in shm_handles: + cshm.destroy_shared_memory_region(shm_handle) + + +class CudaSharedMemoryTest(CudaSharedMemoryTestBase): + def test_invalid_create_shm(self): + # Raises error since tried to create invalid cuda shared memory region + try: + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + except Exception as ex: + self.assertEqual(str(ex), "unable to create cuda shared memory handle") + + def test_valid_create_set_register(self): + # Create a valid cuda shared memory region, fill data in it and register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + cshm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_before_register(self): + # Create a valid cuda shared memory region and unregister before register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + self.triton_client.unregister_cuda_shared_memory("dummy_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 0) + else: + self.assertEqual(len(shm_status.regions), 0) + 
cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_reregister_after_register(self): + # Create a valid cuda shared memory region and unregister after register + shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0) + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + try: + self.triton_client.register_cuda_shared_memory( + "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8 + ) + except Exception as ex: + self.assertIn( + "shared memory region 'dummy_data' already in manager", str(ex) + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 1) + else: + self.assertEqual(len(shm_status.regions), 1) + cshm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_inference(self): + # Unregister after inference + error_msg = [] + shm_handles = self._configure_server() + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + protocol=self.protocol, + use_cuda_shared_memory=True, + ) + if len(error_msg) > 0: + raise Exception(str(error_msg)) + + self.triton_client.unregister_cuda_shared_memory("output0_data") + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 3) + else: + self.assertEqual(len(shm_status.regions), 3) + self._cleanup_server(shm_handles) + + def test_register_after_inference(self): + # Register after inference + error_msg = [] + shm_handles = self._configure_server() + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + protocol=self.protocol, + use_cuda_shared_memory=True, + ) + if len(error_msg) > 0: + raise Exception(str(error_msg)) + shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 64, 0) + self.triton_client.register_cuda_shared_memory( + "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64 + ) + shm_status = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(shm_status), 5) + else: + self.assertEqual(len(shm_status.regions), 5) + shm_handles.append(shm_ip2_handle) + self._cleanup_server(shm_handles) + + def test_too_big_shm(self): + # Shared memory input region larger than needed - Throws error + error_msg = [] + shm_handles = self._configure_server() + shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 128, 0) + self.triton_client.register_cuda_shared_memory( + "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128 + ) + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_ip2_handle, + shm_handles[2], + shm_handles[3], + error_msg, + big_shm_name="input2_data", + big_shm_size=128, + protocol=self.protocol, + use_cuda_shared_memory=True, + ) + if len(error_msg) > 0: + self.assertIn( + "input byte size mismatch for input 'INPUT1' for model 'simple'. 
Expected 64, got 128", + error_msg[-1], + ) + shm_handles.append(shm_ip2_handle) + self._cleanup_server(shm_handles) + + def test_mixed_raw_shm(self): + # Mix of shared memory and RAW inputs + error_msg = [] + shm_handles = self._configure_server() + input1_data = np.ones(shape=16, dtype=np.int32) + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + [input1_data], + shm_handles[2], + shm_handles[3], + error_msg, + protocol=self.protocol, + use_cuda_shared_memory=True, + ) + + if len(error_msg) > 0: + raise Exception(error_msg[-1]) + self._cleanup_server(shm_handles) + + def test_unregisterall(self): + # Unregister all shared memory blocks + shm_handles = self._configure_server() + status_before = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(status_before), 4) + else: + self.assertEqual(len(status_before.regions), 4) + self.triton_client.unregister_cuda_shared_memory() + status_after = self.triton_client.get_cuda_shared_memory_status() + if self.protocol == "http": + self.assertEqual(len(status_after), 0) + else: + self.assertEqual(len(status_after.regions), 0) + self._cleanup_server(shm_handles) + + def test_register_out_of_bound(self): + create_byte_size = self.DEFAULT_SHM_BYTE_SIZE + # Verify various edge cases of registered region size don't go out of bounds of the actual created shm region's size. + with self.assertRaisesRegex( + InferenceServerException, + "failed to register shared memory region.*invalid args", + ): + self._configure_server( + create_byte_size=create_byte_size, + register_byte_size=create_byte_size + 1, + ) + + def test_infer_offset_out_of_bound(self): + # CUDA Shared memory offset outside output region - Throws error + error_msg = [] + shm_handles = self._configure_server() + if self.protocol == "http": + # -32 when placed in an int64 signed type, to get a negative offset + # by overflowing + offset = 2**64 - 32 + else: + # gRPC will throw an error if > 2**63 - 1, so instead test for + # exceeding shm region size by 1 byte, given its size is 64 bytes + offset = 64 + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + shm_output_offset=offset, + protocol=self.protocol, + use_system_shared_memory=False, + use_cuda_shared_memory=True, + ) + + self.assertEqual(len(error_msg), 1) + self.assertIn("Invalid offset for shared memory region", error_msg[0]) + self._cleanup_server(shm_handles) + + def test_infer_byte_size_out_of_bound(self): + # Shared memory byte_size outside output region - Throws error + error_msg = [] + shm_handles = self._configure_server() + offset = 60 + byte_size = self.DEFAULT_SHM_BYTE_SIZE + + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + shm_output_offset=offset, + shm_output_byte_size=byte_size, + protocol=self.protocol, + use_system_shared_memory=False, + use_cuda_shared_memory=True, + ) + self.assertEqual(len(error_msg), 1) + self.assertIn( + "Invalid offset + byte size for shared memory region", error_msg[0] + ) + self._cleanup_server(shm_handles) + + +class TestCudaSharedMemoryUnregister(CudaSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory() + self.assertIn( + "Failed to unregister the following 
cuda shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.unregister_cuda_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("input1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output0_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(InferenceServerException) as ex: + second_client.get_cuda_shared_memory_status("output1_data") + self.assertIn( + "Unable to find cuda shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_cuda_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + 
grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_cuda_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh new file mode 100755 index 0000000000..b7126a9295 --- /dev/null +++ b/qa/L0_cuda_shared_memory/test.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +SHM_TEST=cuda_shared_memory_test.py + +TEST_RESULT_FILE='test_results.txt' +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 +rm -fr *.log + +for i in \ + test_invalid_create_shm \ + test_valid_create_set_register \ + test_unregister_before_register \ + test_unregister_after_register \ + test_reregister_after_register \ + test_unregister_after_inference \ + test_register_after_inference \ + test_too_big_shm \ + test_mixed_raw_shm \ + test_unregisterall \ + test_register_out_of_bound \ + test_infer_offset_out_of_bound \ + test_infer_byte_size_out_of_bound; do + for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" + SERVER_LOG="./$i.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + echo "Test: $i, client type: $client_type" >>$CLIENT_LOG + + set +e + python $SHM_TEST CudaSharedMemoryTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + done +done + +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ +sed -i 's/KIND_CPU/KIND_GPU/g' ./python_models/simple/config.pbtxt + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestCudaSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_custom_model_config/test.sh b/qa/L0_custom_model_config/test.sh new file mode 100755 index 0000000000..d839cacbd5 --- /dev/null +++ b/qa/L0_custom_model_config/test.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR="/data/inferenceserver/${REPO_VERSION}" +CLIENT_LOG="./client.log" +SERVER_LOG="./inference_server.log" + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 +rm -fr *.log + +rm -fr models && mkdir models +cp -r $DATADIR/qa_model_repository/savedmodel_nobatch_float32_float32_float32 models/. 
+mkdir models/savedmodel_nobatch_float32_float32_float32/configs + +test_custom_config() +{ + VERSION=$@ + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/models/savedmodel_nobatch_float32_float32_float32/config` + set -e + if [ "$code" != "200" ]; then + cat $out.out + echo -e "\n***\n*** Test Failed to GET model configuration\n***" + RET=1 + fi + + matches=`grep -o "\"version_policy\":{\"specific\":{\"versions\":\[$VERSION\]}}" curl.out | wc -l` + if [ $matches -ne 1 ]; then + cat curl.out + echo -e "\n***\n*** Expected 1 version_policy:specific:versions, got $matches\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID +} + +# Prepare the file structure +VERSION_DEFAULT="1,3" +VERSION_H100="1" +VERSION_V100="2" +VERSION_CUSTOM="3" + +# Distinguish configs with different model versions +(cd models/savedmodel_nobatch_float32_float32_float32 && \ + sed -i "s/^version_policy:.*/version_policy: { specific: { versions: [$VERSION_DEFAULT] }}/" config.pbtxt) +(cd models/savedmodel_nobatch_float32_float32_float32 && \ + cp config.pbtxt configs/h100.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific: { versions: [$VERSION_H100] }}/" configs/h100.pbtxt) +(cd models/savedmodel_nobatch_float32_float32_float32 && \ + cp config.pbtxt configs/v100.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific: { versions: [$VERSION_V100] }}/" configs/v100.pbtxt) +(cd models/savedmodel_nobatch_float32_float32_float32 && \ + cp config.pbtxt configs/config.pbtxt && \ + sed -i "s/^version_policy:.*/version_policy: { specific: { versions: [$VERSION_CUSTOM] }}/" configs/config.pbtxt) + +# Test default model config +SERVER_ARGS="--model-repository=`pwd`/models" +test_custom_config $VERSION_DEFAULT + +# Test model-config-name=h100 +SERVER_ARGS="--model-repository=`pwd`/models --model-config-name=h100" +test_custom_config $VERSION_H100 + +# Test model-config-name=v100 +SERVER_ARGS="--model-repository=`pwd`/models --model-config-name=v100" +test_custom_config $VERSION_V100 + +# Test model-config-name=config +SERVER_ARGS="--model-repository=`pwd`/models --model-config-name=config" +test_custom_config $VERSION_CUSTOM + +# Test model-config-name=h200. Expect fall back to default config since h200 config does not exist. +SERVER_ARGS="--model-repository=`pwd`/models --model-config-name=h200" +test_custom_config $VERSION_DEFAULT + +# Test model-config-name= +SERVER_ARGS="--model-repository=`pwd`/models --model-config-name=" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Failed: $SERVER started successfully when it was expected to fail\n***" + cat $SERVER_LOG + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_custom_ops/cuda_op_test.py b/qa/L0_custom_ops/cuda_op_test.py new file mode 100755 index 0000000000..bc610b3f0b --- /dev/null +++ b/qa/L0_custom_ops/cuda_op_test.py @@ -0,0 +1,108 @@ +#!/usr/bin/python + +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys +from builtins import range + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + # Run the cudaop model, which depends on a custom operation that + # uses CUDA. The custom operator adds one to each input + model_name = FLAGS.model + elements = 8 + + # Create the inference context for the model. + client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) + + # Create the data for one input tensor. + input_data = np.arange(start=42, stop=42 + elements, dtype=np.int32) + + inputs = [ + client_util.InferInput( + "in", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs) + output_data = results.as_numpy("out") + if output_data is None: + print("error: expected 'out'") + sys.exit(1) + + for i in range(elements): + print( + str(i) + ": input " + str(input_data[i]) + ", output " + str(output_data[i]) + ) + if output_data[i] != (input_data[i] + 1): + print("error: incorrect value") + sys.exit(1) diff --git a/qa/L0_custom_ops/mod_op_test.py b/qa/L0_custom_ops/mod_op_test.py new file mode 100755 index 0000000000..f0f2ccb79d --- /dev/null +++ b/qa/L0_custom_ops/mod_op_test.py @@ -0,0 +1,121 @@ +#!/usr/bin/python + +# Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys +from builtins import range + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + # Run the custom_modulo model, which depends on a custom mod operation + model_name = FLAGS.model + elements = 10 + + # Create the inference context for the model. + client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) + + # Create the data for one input tensor. + input_data = [] + input_data.append(np.arange(start=1, stop=1 + elements, dtype=np.float32)) + input_data.append(np.array([2] * elements, dtype=np.float32)) + + inputs = [] + for i in range(len(input_data)): + inputs.append( + client_util.InferInput( + "INPUT__{}".format(i), + input_data[0].shape, + np_to_triton_dtype(input_data[0].dtype), + ) + ) + inputs[i].set_data_from_numpy(input_data[i]) + + results = client.infer(model_name, inputs) + + # We expect 1 result of size 10 with alternating 1 and 0. 
+ output_data = results.as_numpy("OUTPUT__0") + if output_data is None: + print("error: expected 'OUTPUT__0'") + sys.exit(1) + + for i in range(elements): + print( + str(i) + + ": " + + str(input_data[0][i]) + + " % " + + str(input_data[1][i]) + + " = " + + str(output_data[i]) + ) + if (input_data[0][i] % input_data[1][i]) != output_data[i]: + print("error: incorrect value") + sys.exit(1) diff --git a/qa/L0_custom_ops/onnx_op_test.py b/qa/L0_custom_ops/onnx_op_test.py new file mode 100755 index 0000000000..d717447156 --- /dev/null +++ b/qa/L0_custom_ops/onnx_op_test.py @@ -0,0 +1,119 @@ +#!/usr/bin/python + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys +from builtins import range + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + # Run the custom_modulo model, which depends on a custom mod operation + model_name = FLAGS.model + shape = (3, 5) + dtype = np.float32 + + # Create the inference context for the model. 
+ client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) + + # Create the data for one input tensor. + input_data = [] + input_data.append(np.ones((3, 5), dtype=np.float32)) + input_data.append(np.ones((3, 5), dtype=np.float32)) + + inputs = [] + for i in range(len(input_data)): + inputs.append( + client_util.InferInput( + "input_{}".format(i + 1), shape, np_to_triton_dtype(dtype) + ) + ) + inputs[i].set_data_from_numpy(input_data[i]) + + results = client.infer(model_name, inputs) + + # We expect 1 result of size 10 with alternating 1 and 0. + output_data = results.as_numpy("output") + if output_data is None: + print("error: expected 'output'") + sys.exit(1) + + for i in range(3): + for j in range(5): + print( + str(input_data[0][i][j]) + + " + " + + str(input_data[1][i][j]) + + " = " + + str(output_data[i][j]) + ) + if (input_data[0][i][j] + input_data[1][i][j]) != output_data[i][j]: + print("error: incorrect value") + sys.exit(1) diff --git a/qa/L0_custom_ops/test.sh b/qa/L0_custom_ops/test.sh new file mode 100755 index 0000000000..a12c1d67a4 --- /dev/null +++ b/qa/L0_custom_ops/test.sh @@ -0,0 +1,245 @@ +#!/bin/bash +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +ZERO_OUT_TEST=zero_out_test.py +CUDA_OP_TEST=cuda_op_test.py +MOD_OP_TEST=mod_op_test.py +VISION_OP_TEST=vision_op_test.py +ONNX_OP_TEST=onnx_op_test.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $SERVER_LOG $CLIENT_LOG + +RET=0 + +# Must explicitly set LD_LIBRARY_PATH so that the custom operations +# can find libtensorflow_framework.so. 
+LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow:$LD_LIBRARY_PATH + +# Tensorflow +## Load operations via LD_PRELOAD +SERVER_ARGS="--model-repository=/data/inferenceserver/${REPO_VERSION}/qa_custom_ops/tf_custom_ops" +SERVER_LD_PRELOAD="/data/inferenceserver/${REPO_VERSION}/qa_custom_ops/tf_custom_ops/libzeroout.so:/data/inferenceserver/${REPO_VERSION}/qa_custom_ops/tf_custom_ops/libcudaop.so:/data/inferenceserver/${REPO_VERSION}/qa_custom_ops/tf_custom_ops/libbusyop.so" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $ZERO_OUT_TEST -v -m graphdef_zeroout >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $ZERO_OUT_TEST -v -m savedmodel_zeroout >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $CUDA_OP_TEST -v -m graphdef_cudaop >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $CUDA_OP_TEST -v -m savedmodel_cudaop >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +## Load operations via model config +SERVER_ARGS="--model-repository=tf_custom_ops" +SERVER_LD_PRELOAD="" + +rm -rf tf_custom_ops && \ + mkdir -p tf_custom_ops && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_custom_ops/tf_custom_ops . + +for MODEL_TYPE in savedmodel graphdef; do + echo "model_operations { op_library_filename: \"tf_custom_ops/libbusyop.so\" }" >> tf_custom_ops/${MODEL_TYPE}_busyop/config.pbtxt + echo "model_operations { op_library_filename: \"tf_custom_ops/libcudaop.so\" }" >> tf_custom_ops/${MODEL_TYPE}_cudaop/config.pbtxt + echo "model_operations { op_library_filename: \"tf_custom_ops/libzeroout.so\" }" >> tf_custom_ops/${MODEL_TYPE}_zeroout/config.pbtxt +done + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $ZERO_OUT_TEST -v -m graphdef_zeroout >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $ZERO_OUT_TEST -v -m savedmodel_zeroout >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $CUDA_OP_TEST -v -m graphdef_cudaop >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $CUDA_OP_TEST -v -m savedmodel_cudaop >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Must set LD_LIBRARY_PATH just for the server launch so that the +# custom operations can find libtorch.so and other pytorch dependencies. +LD_LIBRARY_PATH=/opt/tritonserver/backends/pytorch:$LD_LIBRARY_PATH + +# Pytorch +SERVER_ARGS="--model-repository=/data/inferenceserver/${REPO_VERSION}/qa_custom_ops/libtorch_custom_ops" +# FIXME: Pre-loading the python library system to satisfy the symbol definitions +# as the custom op library is built with different python version within +# pytorch container. See DLIS-4152. 
+SERVER_LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libpython3.10.so.1:/data/inferenceserver/${REPO_VERSION}/qa_custom_ops/libtorch_custom_ops/libtorch_modulo/custom_modulo.so" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $MOD_OP_TEST -v -m libtorch_modulo >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +python $VISION_OP_TEST -v -m libtorch_visionop >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +kill $SERVER_PID +wait $SERVER_PID + +# ONNX +rm -rf onnx_custom_ops && \ + mkdir -p onnx_custom_ops/custom_op/1 && \ + cp custom_op_test.onnx onnx_custom_ops/custom_op/1/model.onnx + +touch onnx_custom_ops/custom_op/config.pbtxt +echo "name: \"custom_op\"" >> onnx_custom_ops/custom_op/config.pbtxt && \ +echo "platform: \"onnxruntime_onnx\"" >> onnx_custom_ops/custom_op/config.pbtxt && \ +echo "max_batch_size: 0" >> onnx_custom_ops/custom_op/config.pbtxt && \ +echo "model_operations { op_library_filename: \"./libcustom_op_library.so\" }" >> onnx_custom_ops/custom_op/config.pbtxt + +SERVER_ARGS="--model-repository=onnx_custom_ops --strict-model-config=false" +SERVER_LD_PRELOAD="" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $ONNX_OP_TEST -v -m custom_op >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +kill $SERVER_PID +wait $SERVER_PID + +exit $RET diff --git a/qa/L0_custom_ops/vision_op_test.py b/qa/L0_custom_ops/vision_op_test.py new file mode 100755 index 0000000000..88857c3d12 --- /dev/null +++ b/qa/L0_custom_ops/vision_op_test.py @@ -0,0 +1,111 @@ +#!/usr/bin/python + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + # Run the libtorch_visionop model, which depends on a torchvision custom operation + model_name = FLAGS.model + + # Create the inference context for the model. + client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) + + # Create the data for the input tensors. + input_data = np.random.rand(1, 3, 10, 10).astype(np.float32) + box_data = np.array([[1, 1, 2, 3, 4]]).astype(np.float32) + + inputs = [] + inputs.append( + client_util.InferInput( + "INPUT__0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ) + inputs[0].set_data_from_numpy(input_data) + inputs.append( + client_util.InferInput( + "INPUT__1", box_data.shape, np_to_triton_dtype(box_data.dtype) + ) + ) + inputs[1].set_data_from_numpy(box_data) + + results = client.infer(model_name, inputs) + + # We expect 1 result of shape [1, 3, 5, 5]. + output_data = results.as_numpy("OUTPUT__0") + if output_data is None: + print("error: expected 'OUTPUT__0'") + sys.exit(1) + + if output_data.shape != (1, 3, 5, 5): + print("error: incorrect shape " + str(output_data.shape) + "for 'OUTPUT__0'") + sys.exit(1) diff --git a/qa/L0_custom_ops/zero_out_test.py b/qa/L0_custom_ops/zero_out_test.py new file mode 100755 index 0000000000..86fdcb8a30 --- /dev/null +++ b/qa/L0_custom_ops/zero_out_test.py @@ -0,0 +1,111 @@ +#!/usr/bin/python + +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys +from builtins import range + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + # Run the zero-out model, which depends on a custom operation + model_name = FLAGS.model + elements = 8 + + # Create the inference context for the model. + client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) + + # Create the data for one input tensor. + input_data = np.arange(start=42, stop=42 + elements, dtype=np.int32) + + inputs = [ + client_util.InferInput( + "to_zero", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + results = client.infer(model_name, inputs) + + # We expect 1 result with all inputs except first to be zeroed. 
+ output_data = results.as_numpy("zeroed") + if output_data is None: + print("error: expected 'zeroed'") + sys.exit(1) + + for i in range(elements): + print( + str(i) + ": input " + str(input_data[i]) + ", output " + str(output_data[i]) + ) + if (i == 0) and (input_data[i] != output_data[i]): + print("error: incorrect value") + sys.exit(1) + if (i != 0) and (output_data[i] != 0): + print("error: expected 0") + sys.exit(1) diff --git a/qa/L0_data_compression/test.sh b/qa/L0_data_compression/test.sh new file mode 100755 index 0000000000..9ec0487a4b --- /dev/null +++ b/qa/L0_data_compression/test.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +TEST_LOG="./data_compressor_test.log" +DATA_COMPRESSOR_TEST=./data_compressor_test + + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log *_data + +set +e + +echo "All work and no play makes Jack a dull boy" >> raw_data +python3 validation.py generate_compressed_data + +LD_LIBRARY_PATH=/opt/tritonserver/lib:${LD_LIBRARY_PATH} $DATA_COMPRESSOR_TEST >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Data Compression Test Failed\n***" + RET=1 +fi + +python3 validation.py validate_compressed_data +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Data Compression Failed\n***" + RET=1 +fi + +set -e + +# End-to-end testing with simple model +function run_data_compression_infer_client() { + local client_path=$1 + local request_algorithm=$2 + local response_algorithm=$3 + local log_path=$4 + + local python_or_cpp=`echo -n "$client_path" | tail -c 3` + if [ "$python_or_cpp" == ".py" ]; then + local infer_client="python $client_path" + local request_cmd_option="--request-compression-algorithm $request_algorithm" + local response_cmd_option="--response-compression-algorithm $response_algorithm" + else # C++ if not end with ".py" + local infer_client=$client_path + local request_cmd_option="-i $request_algorithm" + local response_cmd_option="-o $response_algorithm" + fi + + local cmd_options="-v" + if [ "$request_algorithm" != "" ]; then + cmd_options+=" $request_cmd_option" + fi + if [ "$response_algorithm" != "" ]; then + cmd_options+=" $response_cmd_option" + fi + + $infer_client $cmd_options >> $log_path 2>&1 + return $? +} + +SIMPLE_INFER_CLIENT_PY=../clients/simple_http_infer_client.py +SIMPLE_AIO_INFER_CLIENT_PY=../clients/simple_http_aio_infer_client.py +SIMPLE_INFER_CLIENT=../clients/simple_http_infer_client + +CLIENT_LOG=`pwd`/client.log +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +for INFER_CLIENT in "$SIMPLE_INFER_CLIENT_PY" "$SIMPLE_AIO_INFER_CLIENT_PY" "$SIMPLE_INFER_CLIENT"; do + for REQUEST_ALGORITHM in "deflate" "gzip" ""; do + for RESPONSE_ALGORITHM in "deflate" "gzip" ""; do + if [ "$REQUEST_ALGORITHM" == "$RESPONSE_ALGORITHM" ]; then + continue + fi + + set +e + run_data_compression_infer_client "$INFER_CLIENT" "$REQUEST_ALGORITHM" "$RESPONSE_ALGORITHM" "$CLIENT_LOG" + if [ $? -ne 0 ]; then + RET=1 + fi + set -e + done + done +done + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + cat $SERVER_LOG + cat ${CLIENT_LOG} + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_data_compression/validation.py b/qa/L0_data_compression/validation.py new file mode 100755 index 0000000000..a0e5cb1576 --- /dev/null +++ b/qa/L0_data_compression/validation.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + + +def generate_compressed_data(): + with open("raw_data", "rb") as f: + import gzip + import zlib + + raw_data = f.read() + with open("deflate_compressed_data", "wb") as of: + of.write(zlib.compress(raw_data)) + with open("gzip_compressed_data", "wb") as of: + of.write(gzip.compress(raw_data)) + + +def validate_compressed_data(): + with open("raw_data", "rb") as f: + import gzip + import zlib + + raw_data = f.read() + with open("generated_deflate_compressed_data", "rb") as cf: + decompressed_data = zlib.decompress(cf.read()) + if decompressed_data != raw_data: + exit(1) + with open("generated_gzip_compressed_data", "rb") as cf: + decompressed_data = gzip.decompress(cf.read()) + if decompressed_data != raw_data: + exit(1) + + +if __name__ == "__main__": + globals()[sys.argv[1]]() diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py new file mode 100755 index 0000000000..d7bc59f5c7 --- /dev/null +++ b/qa/L0_decoupled/decoupled_test.py @@ -0,0 +1,659 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
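Tying back to the L0_data_compression test above: `run_data_compression_infer_client` drives the example clients with request/response compression flags. A rough stand-alone equivalent using `tritonclient.http` directly, assuming the standard `simple` model and that the Python client exposes the compression options under the keyword argument names used below, could look like this sketch:

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

inputs = [
    httpclient.InferInput("INPUT0", [1, 16], "INT32"),
    httpclient.InferInput("INPUT1", [1, 16], "INT32"),
]
inputs[0].set_data_from_numpy(np.arange(16, dtype=np.int32).reshape(1, 16))
inputs[1].set_data_from_numpy(np.ones((1, 16), dtype=np.int32))

# Compress the request body with gzip and ask for a deflate-compressed
# response; omitting either argument leaves that direction uncompressed,
# matching the empty-algorithm cases in the loop above.
result = client.infer(
    "simple",
    inputs,
    request_compression_algorithm="gzip",
    response_compression_algorithm="deflate",
)
print(result.as_numpy("OUTPUT0"))
```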
+ +import sys + +sys.path.append("../common") + +import os +import queue +import time +import unittest +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._response_queue = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._response_queue.put(error) + else: + user_data._response_queue.put(result) + + +class DecoupledTest(tu.TestResultCollector): + def setUp(self): + self.trials_ = [ + ("repeat_int32", None), + ("simple_repeat", None), + ("sequence_repeat", None), + ("fan_repeat", self._fan_validate), + ("repeat_square", self._nested_validate), + ("nested_square", self._nested_validate), + ] + self.model_name_ = "repeat_int32" + + self.inputs_ = [] + self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) + self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) + self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) + + self.outputs_ = [] + self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) + self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) + # Some trials only expect a subset of outputs + self.requested_outputs_ = self.outputs_ + + # Client can receive a "triton_final_response" response parameter + # from Triton server that indicates when a response is the final response for + # its request. + # + # For non-decoupled models, there is a 1:1 request:response ratio, so every + # response is the final response, and this parameter is unnecessary. + # + # For decoupled models, there is a 1:N request:response ratio, so there may be + # more one response before receiving the "final" response. + # + # However, decoupled models have the unique property in that they can return + # a flags-only response to the server to indicate completion, which is not + # returned to the client by default (See TRITONBACKEND_ResponseFactorySendFlags). + # + # To forward this flags-only response to the client, users must opt-in to this + # behavior by adding the following argument: + # client.async_stream_infer(..., enable_empty_final_response=True). + # + # If the decoupled backend/model always sends the final response flag along + # with a non-null response, no opt-in is needed. + # + # With this behavior, the client can programmatically detect when all responses + # for an individual request have been received without knowing the expected + # number of responses in advance and without closing the stream. 
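As a compact, standalone illustration of the opt-in flow described in the comment above (the helper methods that follow implement the same pattern with extra bookkeeping), a client of the `repeat_int32` model used by these tests might do roughly the following; the concrete IN/DELAY/WAIT values simply mirror the defaults used later in `_decoupled_infer`:

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

responses = queue.Queue()

def on_response(result, error):
    # Stream callback: enqueue either the result or the error.
    responses.put(error if error else result)

inputs = [
    grpcclient.InferInput("IN", [1], "INT32"),
    grpcclient.InferInput("DELAY", [1], "UINT32"),
    grpcclient.InferInput("WAIT", [1], "UINT32"),
]
inputs[0].set_data_from_numpy(np.array([100], dtype=np.int32))
inputs[1].set_data_from_numpy(np.array([1000], dtype=np.uint32))  # delay before each response
inputs[2].set_data_from_numpy(np.array([500], dtype=np.uint32))   # wait before releasing the request

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback=on_response)
    client.async_stream_infer(
        model_name="repeat_int32",
        inputs=inputs,
        request_id="0",
        # Opt in to flags-only responses so the final-response marker is forwarded.
        enable_empty_final_response=True,
    )

    while True:
        item = responses.get()
        if isinstance(item, InferenceServerException):
            raise item
        response = item.get_response()
        if response.outputs:  # ignore empty, flags-only responses
            print(response.id, item.as_numpy("OUT"))
        if response.parameters.get("triton_final_response").bool_param:
            break
```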
+ def _stream_infer_with_params( + self, + request_count, + request_delay, + _, + delay_data, + delay_factor, + user_data, + result_dict, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + triton_client.async_stream_infer( + model_name=self.model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + # Opt-in to receiving flags-only responses from model/backend + # to help detect final responses for decoupled models. + enable_empty_final_response=True, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + completed_requests = 0 + while completed_requests < request_count: + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response = data_item.get_response() + # Request IDs should generally be provided with each request + # to associate decoupled responses with their requests. + if not response.id: + raise ValueError( + "No response id found. Was a request_id provided?" + ) + + # Detect final response. Parameters are oneof and we expect bool_param + if response.parameters.get("triton_final_response").bool_param: + completed_requests += 1 + + # Only process non-empty response, ignore if empty (no outputs) + if response.outputs: + if response.id not in result_dict: + result_dict[response.id] = [] + result_dict[response.id].append((recv_count, data_item)) + recv_count += 1 + + def _stream_infer( + self, + request_count, + request_delay, + expected_count, + delay_data, + delay_factor, + user_data, + result_dict, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + triton_client.async_stream_infer( + model_name=self.model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... 
+ recv_count = 0 + while recv_count < expected_count: + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict: + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + def _fan_validate(self, result_list, data_offset, repeat_count): + # fan_repeat returns "2 * data_offset" as result + self.assertEqual(len(result_list), repeat_count) + expected_data = 2 * data_offset + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + expected_data += 2 + + def _nested_validate(self, result_list, data_offset, repeat_count): + # if repeat model returns repeat result n, repeat_square-like model + # will return the same result n times + expected_len = sum(x for x in range(data_offset, data_offset + repeat_count)) + self.assertEqual(len(result_list), expected_len) + expected_data = data_offset + expected_count = expected_data + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + expected_count -= 1 + if expected_count == 0: + expected_data += 1 + expected_count = expected_data + + def _decoupled_infer( + self, + request_count, + request_delay=0, + repeat_count=1, + data_offset=100, + delay_time=1000, + delay_factor=1, + wait_time=500, + order_sequence=None, + validate_fn=None, + ): + # Initialize data for IN + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + self.inputs_[1].set_shape([repeat_count]) + + # Initialize data for WAIT + wait_data = np.array([wait_time], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + # use validate_fn to differentiate requested outputs + self.requested_outputs_ = ( + self.outputs_ if validate_fn is None else self.outputs_[0:1] + ) + + for infer_helper in [self._stream_infer, self._stream_infer_with_params]: + user_data = UserData() + result_dict = {} + + try: + if "square" not in self.model_name_: + expected_count = repeat_count * request_count + else: + expected_count = ( + sum(x for x in range(data_offset, data_offset + repeat_count)) + * request_count + ) + infer_helper( + request_count, + request_delay, + expected_count, + delay_data, + delay_factor, + user_data, + result_dict, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. 
+ for i in range(request_count): + this_id = str(i) + if repeat_count != 0 and this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + elif repeat_count == 0 and this_id in result_dict.keys(): + self.assertTrue( + False, + "received unexpected response for request id {}".format( + this_id + ), + ) + if repeat_count != 0: + if validate_fn is None: + self.assertEqual(len(result_dict[this_id]), repeat_count) + expected_data = data_offset + result_list = result_dict[this_id] + for j in range(len(result_list)): + if order_sequence is not None: + self.assertEqual( + result_list[j][0], order_sequence[i][j] + ) + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + this_idx = result_list[j][1].as_numpy("IDX") + self.assertEqual(len(this_idx), 1) + self.assertEqual(this_idx[0], j) + expected_data += 1 + else: + validate_fn(result_dict[this_id], data_offset, repeat_count) + + def test_one_to_none(self): + # Test cases where each request generates no response. + # Note the name of the test one_to_none implies the + # mapping between requests and responses. + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1]) + # Multiple request case + self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1]) + + def test_one_to_one(self): + # Test cases where each request generates single response. + # Note the name of the test one_to_one implies the + # mapping between requests and responses. + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the response is delivered + self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1]) + # Release request after the response is delivered + self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1]) + + # Multiple request case + # Release request before the response is delivered + self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1]) + # Release request after the response is delivered + self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1]) + + def test_one_to_many(self): + # Test cases where each request generates multiple response. + # Note the name of the test one_to_many implies the + # mapping between requests and responses. 
+ + self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + # Multiple request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + def test_one_to_multi_many(self): + # Test cases where each request generates multiple response but the + # responses are delayed so as to stress the control path handling the + # queued responses. + + self.assertTrue("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1] + ) + + # Multiple request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + def test_response_order(self): + # Test the expected response order for different cases + + self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + + # Case 1: Interleaved responses + self._decoupled_infer( + request_count=2, + request_delay=500, + repeat_count=4, + order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]], + validate_fn=trial[1], + ) + + # Case 2: All responses of second request delivered before any + # response from the first + self._decoupled_infer( + request_count=2, + request_delay=500, + repeat_count=4, + delay_time=2000, + delay_factor=0.1, + order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]], + validate_fn=trial[1], + ) + + # Case 3: Similar to Case 2, but the second request is generated + # after the first response from first request is received + self._decoupled_infer( + request_count=2, + request_delay=2500, + repeat_count=4, + delay_time=2000, + delay_factor=0.1, + order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]], + validate_fn=trial[1], + ) + + # Case 4: All the 
responses of the second request are delivered after + # all the responses from the first request are received + self._decoupled_infer( + request_count=2, + request_delay=100, + repeat_count=4, + delay_time=500, + delay_factor=10, + order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], + validate_fn=trial[1], + ) + + # Case 5: Similar to Case 4, but the second request is generated + # after the first response from the first request is received + self._decoupled_infer( + request_count=2, + request_delay=750, + repeat_count=4, + delay_time=500, + delay_factor=10, + order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], + validate_fn=trial[1], + ) + + def _no_streaming_helper(self, protocol): + data_offset = 100 + repeat_count = 1 + delay_time = 1000 + wait_time = 2000 + + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + wait_data = np.array([wait_time], dtype=np.uint32) + + if protocol == "grpc": + # Use the inputs and outputs from the setUp + this_inputs = self.inputs_ + this_outputs = self.outputs_ + else: + this_inputs = [] + this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32")) + this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32")) + this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32")) + this_outputs = [] + this_outputs.append(httpclient.InferRequestedOutput("OUT")) + + # Initialize data for IN + this_inputs[0].set_shape([repeat_count]) + this_inputs[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + this_inputs[1].set_shape([repeat_count]) + this_inputs[1].set_data_from_numpy(delay_data) + + # Initialize data for WAIT + this_inputs[2].set_data_from_numpy(wait_data) + + if protocol == "grpc": + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + else: + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + + with self.assertRaises(InferenceServerException) as cm: + triton_client.infer( + model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs + ) + + self.assertIn( + "doesn't support models with decoupled transaction policy", + str(cm.exception), + ) + + def test_no_streaming(self): + # Test cases with no streaming inference. Server should give + # appropriate error in such cases. + + self._no_streaming_helper("grpc") + self._no_streaming_helper("http") + + def test_wrong_shape(self): + # Sends mismatching shapes for IN and DELAY. Server should return + # appropriate error message. The shape of IN is [repeat_count], + # whereas the shape of DELAY is [repeat_count + 1].
+ + data_offset = 100 + repeat_count = 1 + delay_time = 1000 + wait_time = 2000 + + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time + wait_data = np.array([wait_time], dtype=np.uint32) + + # Initialize data for IN + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + self.inputs_[1].set_shape([repeat_count + 1]) + self.inputs_[1].set_data_from_numpy(delay_data) + + # Initialize data for WAIT + self.inputs_[2].set_data_from_numpy(wait_data) + + user_data = UserData() + result_dict = {} + + with self.assertRaises(InferenceServerException) as cm: + self._stream_infer( + 1, 0, repeat_count, delay_data, 1, user_data, result_dict + ) + + self.assertIn( + "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception) + ) + + +class NonDecoupledTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "repeat_int32" + self.input_data = { + "IN": np.array([1], dtype=np.int32), + "DELAY": np.array([0], dtype=np.uint32), + "WAIT": np.array([0], dtype=np.uint32), + } + + def test_grpc(self): + inputs = [ + grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + def test_http(self): + inputs = [ + httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + httpclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_decoupled/models/fan_repeat/config.pbtxt b/qa/L0_decoupled/models/fan_repeat/config.pbtxt new file mode 100644 index 0000000000..f56b8c59f3 --- /dev/null +++ b/qa/L0_decoupled/models/fan_repeat/config.pbtxt @@ -0,0 +1,106 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
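The decoupled_test.py cases above drive decoupled models through helpers such as `_stream_infer`, which wrap the gRPC streaming client. For orientation, here is a minimal, hedged sketch of that streaming pattern; it is not part of the test sources, and it assumes a server already running on localhost:8001 with the repeat_int32 model loaded (the values passed are illustrative):

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient

# Streamed responses (or errors) arrive asynchronously; collect them in a queue,
# mirroring the UserData pattern used by the tests above.
responses = queue.Queue()


def callback(result, error):
    responses.put(error if error is not None else result)


client = grpcclient.InferenceServerClient(url="localhost:8001")
client.start_stream(callback=callback)

inputs = [
    grpcclient.InferInput("IN", [1], "INT32"),
    grpcclient.InferInput("DELAY", [1], "UINT32"),
    grpcclient.InferInput("WAIT", [1], "UINT32"),
]
inputs[0].set_data_from_numpy(np.array([100], dtype=np.int32))    # value(s) to repeat
inputs[1].set_data_from_numpy(np.array([1000], dtype=np.uint32))  # per-response delay (ms)
inputs[2].set_data_from_numpy(np.array([500], dtype=np.uint32))   # request release wait (ms)

# A decoupled model such as repeat_int32 may send zero or more responses per request.
client.async_stream_infer(model_name="repeat_int32", inputs=inputs, request_id="0")

first = responses.get()  # blocks until the first streamed response (or error) arrives
print(first.as_numpy("OUT"))
client.stop_stream()
```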
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "fan_repeat" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "repeat_out" + } + }, + { + model_name: "identity_int32" + model_version: -1 + input_map { + key: "INPUT0" + value: "repeat_out" + } + output_map { + key: "OUTPUT0" + value: "identity_out" + } + }, + { + model_name: "libtorch_nobatch_int32_int32_int32" + model_version: -1 + input_map { + key: "INPUT0" + value: "repeat_out" + } + input_map { + key: "INPUT1" + value: "identity_out" + } + output_map { + key: "OUTPUT__1" + value: "OUT" + } + + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_decoupled/models/identity_int32/config.pbtxt b/qa/L0_decoupled/models/identity_int32/config.pbtxt new file mode 100644 index 0000000000..cee7f35f09 --- /dev/null +++ b/qa/L0_decoupled/models/identity_int32/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
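The fan_repeat ensemble above fans each streamed repeat_int32 output into identity_int32 and a libtorch model that consumes both copies, so each streamed OUT value ends up being twice the repeated input value; that is exactly what `_fan_validate` in decoupled_test.py checks. A small worked sketch with illustrative numbers:

```python
# Illustrative numbers only: expected streamed OUT values for fan_repeat
# with data_offset=100 and repeat_count=3, per _fan_validate.
data_offset, repeat_count = 100, 3
repeated_inputs = list(range(data_offset, data_offset + repeat_count))  # [100, 101, 102]
expected_out = [2 * v for v in repeated_inputs]                         # [200, 202, 204]

# _fan_validate walks the same sequence: 2 * data_offset, then +2 per response.
assert expected_out == [2 * data_offset + 2 * j for j in range(repeat_count)]
```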
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity_int32" +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] diff --git a/qa/L0_decoupled/models/nested_square/config.pbtxt b/qa/L0_decoupled/models/nested_square/config.pbtxt new file mode 100644 index 0000000000..755c39854e --- /dev/null +++ b/qa/L0_decoupled/models/nested_square/config.pbtxt @@ -0,0 +1,89 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "nested_square" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "simple_repeat" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "repeat_out" + } + }, + { + model_name: "square_int32" + model_version: -1 + input_map { + key: "IN" + value: "repeat_out" + } + output_map { + key: "OUT" + value: "OUT" + } + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_decoupled/models/repeat_square/config.pbtxt b/qa/L0_decoupled/models/repeat_square/config.pbtxt new file mode 100644 index 0000000000..112b0025e5 --- /dev/null +++ b/qa/L0_decoupled/models/repeat_square/config.pbtxt @@ -0,0 +1,89 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "repeat_square" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "repeat_out" + } + }, + { + model_name: "square_int32" + model_version: -1 + input_map { + key: "IN" + value: "repeat_out" + } + output_map { + key: "OUT" + value: "OUT" + } + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_decoupled/models/sequence_repeat/config.pbtxt b/qa/L0_decoupled/models/sequence_repeat/config.pbtxt new file mode 100644 index 0000000000..3b23b4eb4c --- /dev/null +++ b/qa/L0_decoupled/models/sequence_repeat/config.pbtxt @@ -0,0 +1,98 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "sequence_repeat" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "repeat_out" + } + output_map { + key: "IDX" + value: "IDX" + } + }, + { + model_name: "identity_int32" + model_version: -1 + input_map { + key: "INPUT0" + value: "repeat_out" + } + output_map { + key: "OUTPUT0" + value: "OUT" + } + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "IDX" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_decoupled/models/simple_repeat/config.pbtxt b/qa/L0_decoupled/models/simple_repeat/config.pbtxt new file mode 100644 index 0000000000..77f6ea98ec --- /dev/null +++ b/qa/L0_decoupled/models/simple_repeat/config.pbtxt @@ -0,0 +1,86 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "simple_repeat" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "OUT" + } + output_map { + key: "IDX" + value: "IDX" + } + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "IDX" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh new file mode 100755 index 0000000000..22c37dff49 --- /dev/null +++ b/qa/L0_decoupled/test.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 +TEST_RESULT_FILE='test_results.txt' +DECOUPLED_TEST=decoupled_test.py + +rm -f *.log + +CLIENT_LOG=`pwd`/client.log +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + + +TRIALS="python custom" + +for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=`pwd`/python_models + else + MODELDIR=`pwd`/models + fi + + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. 
+ (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done + + # Will delay the writing of each response by the specified many milliseconds. + # This will ensure that there are multiple responses available to be written. + export TRITONSERVER_DELAY_GRPC_RESPONSE=2000 + + echo "Test: test_one_to_multi_many" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + unset TRITONSERVER_DELAY_GRPC_RESPONSE + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test the server frontend can merge the responses of non-decoupled model that +# sends inference response and COMPLETE flag separately. In other words, from +# the client's perspective there will still be one response. +NON_DECOUPLED_DIR=`pwd`/non_decoupled_models +rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR} +cp -r `pwd`/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \ + (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \ + sed -i "s/decoupled: True/decoupled: False/" config.pbtxt) + +SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}" +SERVER_LOG="./non_decoupled_inference_server.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +CLIENT_LOG=`pwd`/non_decoupled_client.log +echo "Test: NonDecoupledTest" >>$CLIENT_LOG +set +e +python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET \ No newline at end of file diff --git a/qa/L0_device_memory_tracker/test.py b/qa/L0_device_memory_tracker/test.py new file mode 100755 index 0000000000..1d443d1032 --- /dev/null +++ b/qa/L0_device_memory_tracker/test.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time +import unittest +from functools import partial + +import nvidia_smi +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient + + +class UnifiedClientProxy: + def __init__(self, client): + self.client_ = client + + def __getattr__(self, attr): + forward_attr = getattr(self.client_, attr) + if type(self.client_) == grpcclient.InferenceServerClient: + if attr == "get_model_config": + return lambda *args, **kwargs: forward_attr( + *args, **kwargs, as_json=True + )["config"] + elif attr == "get_inference_statistics": + return partial(forward_attr, as_json=True) + return forward_attr + + +class MemoryUsageTest(unittest.TestCase): + def setUp(self): + nvidia_smi.nvmlInit() + self.gpu_handle_ = nvidia_smi.nvmlDeviceGetHandleByIndex(0) + self.http_client_ = httpclient.InferenceServerClient(url="localhost:8000") + self.grpc_client_ = grpcclient.InferenceServerClient(url="localhost:8001") + + def tearDown(self): + nvidia_smi.nvmlShutdown() + + def report_used_gpu_memory(self): + info = nvidia_smi.nvmlDeviceGetMemoryInfo(self.gpu_handle_) + return info.used + + def is_testing_backend(self, model_name, backend_name): + return self.client_.get_model_config(model_name)["backend"] == backend_name + + def verify_recorded_usage(self, model_stat): + recorded_gpu_usage = 0 + for usage in model_stat["memory_usage"]: + if usage["type"] == "GPU": + recorded_gpu_usage += int(usage["byte_size"]) + # unload and verify recorded usage + before_total_usage = self.report_used_gpu_memory() + self.client_.unload_model(model_stat["name"]) + # unload can return before the model is fully unloaded, + # wait to be finished + time.sleep(2) + usage_delta = before_total_usage - self.report_used_gpu_memory() + # check with tolerance as gpu usage obtained is overall usage + self.assertTrue( + usage_delta * 0.9 <= recorded_gpu_usage <= usage_delta * 1.1, + msg="For model {}, expect recorded usage to be in range [{}, {}], got {}".format( + model_stat["name"], + usage_delta * 0.9, + usage_delta * 1.1, + recorded_gpu_usage, + ), + ) + + def 
test_onnx_http(self): + self.client_ = UnifiedClientProxy(self.http_client_) + model_stats = self.client_.get_inference_statistics()["model_stats"] + for model_stat in model_stats: + if self.is_testing_backend(model_stat["name"], "onnxruntime"): + self.verify_recorded_usage(model_stat) + + def test_plan_grpc(self): + self.client_ = UnifiedClientProxy(self.grpc_client_) + model_stats = self.client_.get_inference_statistics()["model_stats"] + for model_stat in model_stats: + if self.is_testing_backend(model_stat["name"], "tensorrt"): + self.verify_recorded_usage(model_stat) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_device_memory_tracker/test.sh b/qa/L0_device_memory_tracker/test.sh new file mode 100755 index 0000000000..0fffe94317 --- /dev/null +++ b/qa/L0_device_memory_tracker/test.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_LOG="./test.log" +TEST_PY=test.py + +DATADIR=/data/inferenceserver/${REPO_VERSION} +rm -f *.log + +TRTEXEC=/usr/src/tensorrt/bin/trtexec +TEST_RESULT_FILE='test_results.txt' +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./server.log" + +source ../common/util.sh + +RET=0 + +# prepare model repository, only contains ONNX and TRT models as the +# corresponding backend are known to be memory. +rm -rf models && mkdir models +# ONNX +cp -r /data/inferenceserver/${REPO_VERSION}/onnx_model_store/* models/. 
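The `verify_recorded_usage` check in test.py above compares Triton's per-model `memory_usage` statistics against the drop in whole-device memory reported by NVML when the model is unloaded, allowing a ±10% tolerance because NVML reports overall GPU usage rather than per-model usage. A small worked sketch of that comparison, with made-up byte counts:

```python
# Hypothetical numbers, for illustration only.
before_total = 6_000_000_000   # NVML "used" bytes while the model is loaded
after_total = 5_200_000_000    # NVML "used" bytes after unloading the model
usage_delta = before_total - after_total   # 800 MB freed by the unload

recorded = 780_000_000         # sum of GPU "byte_size" entries in the model statistics

# The test passes when the recorded figure falls within +/-10% of the NVML delta.
assert usage_delta * 0.9 <= recorded <= usage_delta * 1.1
```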
+rm -r models/*cpu + +set +e + +# VGG19 plan +rm -fr models/vgg19_plan && mkdir -p models/vgg19_plan/1 && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/vgg19_onnx/1/model.onnx models/vgg19_plan/ && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/vgg19_onnx/labels.txt models/vgg19_plan/ + +$TRTEXEC --onnx=models/vgg19_plan/model.onnx --saveEngine=models/vgg19_plan/1/model.plan \ + --minShapes=input:1x3x224x224 --optShapes=input:32x3x224x224 \ + --maxShapes=input:32x3x224x224 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to generate vgg19 PLAN\n***" + exit 1 +fi + +rm models/vgg19_plan/model.onnx +cp $DATADIR/qa_dynamic_batch_image_model_repository/vgg19_onnx/config.pbtxt models/vgg19_plan/ && \ +sed -i "s/^name: .*/name: \"vgg19_plan\"/g" models/vgg19_plan/config.pbtxt && \ +sed -i 's/^platform: .*/platform: "tensorrt_plan"/g' models/vgg19_plan/config.pbtxt + +# Resnet50 plan +rm -fr models/resnet50_plan && mkdir -p models/resnet50_plan/1 && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/1/model.onnx models/resnet50_plan/ && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/labels.txt models/resnet50_plan/ + +$TRTEXEC --onnx=models/resnet50_plan/model.onnx --saveEngine=models/resnet50_plan/1/model.plan \ + --minShapes=input:1x3x224x224 --optShapes=input:32x3x224x224 \ + --maxShapes=input:32x3x224x224 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to generate resnet50 PLAN\n***" + exit 1 +fi + +rm models/resnet50_plan/model.onnx +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/config.pbtxt models/resnet50_plan/ && \ +sed -i "s/^name: .*/name: \"resnet50_plan\"/g" models/resnet50_plan/config.pbtxt && \ +sed -i 's/^platform: .*/platform: "tensorrt_plan"/g' models/resnet50_plan/config.pbtxt + + +# Resnet152 plan +rm -fr models/resnet152_plan && mkdir -p models/resnet152_plan/1 && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet152_onnx/1/model.onnx models/resnet152_plan/ && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet152_onnx/labels.txt models/resnet152_plan/ + +$TRTEXEC --onnx=models/resnet152_plan/model.onnx --saveEngine=models/resnet152_plan/1/model.plan \ + --minShapes=input:1x3x224x224 --optShapes=input:32x3x224x224 \ + --maxShapes=input:32x3x224x224 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to generate resnet152 PLAN\n***" + exit 1 +fi + +rm models/resnet152_plan/model.onnx +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet152_onnx/config.pbtxt models/resnet152_plan/ && \ +sed -i "s/^name: .*/name: \"resnet152_plan\"/g" models/resnet152_plan/config.pbtxt && \ +sed -i 's/^platform: .*/platform: "tensorrt_plan"/g' models/resnet152_plan/config.pbtxt + +set -e + +# Set multiple instances on selected model to test instance-wise collection +# and accumulation. +echo "instance_group [{ count: 2; kind: KIND_GPU }]" >> models/resnet152_plan/config.pbtxt +echo "instance_group [{ count: 2; kind: KIND_GPU }]" >> models/densenet/config.pbtxt + +# testing use nvidia-smi for Python to validate the reported usage +pip install nvidia-ml-py3 + +# Start server to load all models (in parallel), then gradually unload +# the models and expect the memory usage changes matches what are reported +# in statistic. 
+SERVER_ARGS="--backend-config=triton-backend-memory-tracker=true --model-repository=models --model-control-mode=explicit --load-model=*" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_PY > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi +set -e +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $SERVER_LOG + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh new file mode 100755 index 0000000000..ae72daa7d0 --- /dev/null +++ b/qa/L0_dlpack_multi_gpu/test.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +CLIENT_PY=./test_infer_shm_leak.py +CLIENT_LOG="./client.log" +EXPECTED_NUM_TESTS="1" +TEST_RESULT_FILE='test_results.txt' +SERVER_LOG="./inference_server.log" +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +RET=0 +rm -fr *.log ./models + +source ../common/util.sh + +# Uninstall the non CUDA version of PyTorch +pip3 uninstall -y torch +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html +pip3 install tensorflow + +# Install CuPy for testing non_blocking compute streams +pip3 install cupy-cuda12x + +rm -fr *.log ./models + +mkdir -p models/dlpack_test/1/ +cp ../python_models/dlpack_test/model.py models/dlpack_test/1/ +cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test +cp ../L0_backend_python/test_infer_shm_leak.py . 
+sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +fi + +set +e +export MODEL_NAME="dlpack_test" +python3 -m pytest --junitxml=dlpack_multi_gpu.report.xml $CLIENT_PY > $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** python_unittest.py FAILED. \n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** dlpack_multi_gpu test FAILED. \n***" +else + echo -e "\n***\n*** dlpack_multi_gpu test PASSED. \n***" +fi + +exit $RET diff --git a/qa/L0_doc_links/mkdocs.yml b/qa/L0_doc_links/mkdocs.yml new file mode 100644 index 0000000000..1588680d92 --- /dev/null +++ b/qa/L0_doc_links/mkdocs.yml @@ -0,0 +1,44 @@ +# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +site_name: CI Test +use_directory_urls: False +docs_dir: "./repos" +plugins: + - htmlproofer + - search diff --git a/qa/L0_doc_links/test.sh b/qa/L0_doc_links/test.sh new file mode 100755 index 0000000000..e30ddd59eb --- /dev/null +++ b/qa/L0_doc_links/test.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
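The L0_dlpack_multi_gpu test above exercises DLPack tensor exchange inside the Python-backend dlpack_test model, installing CuPy alongside PyTorch for the purpose. As a hedged orientation sketch (not part of the test, and assuming at least two visible GPUs as implied by CUDA_VISIBLE_DEVICES=0,1,2,3), the kind of zero-copy hand-off DLPack enables looks like this:

```python
import cupy as cp
import torch

# Create a CuPy array on a non-default GPU and view it as a torch.Tensor
# without copying, via the DLPack protocol.
with cp.cuda.Device(1):
    src = cp.arange(16, dtype=cp.float32)  # CuPy array allocated on GPU 1
    dst = torch.from_dlpack(src)           # zero-copy torch view of the same memory

assert dst.device.type == "cuda" and tuple(dst.shape) == (16,)
```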
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOG="`pwd`/log.txt" +CONFIG="`pwd`/mkdocs.yml" +RET=0 +# Download necessary packages +python3 -m pip install mkdocs +python3 -m pip install mkdocs-htmlproofer-plugin + +# Get the necessary repos +mkdir repos && cd repos +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_BACKEND_REPO_TAG=${TRITON_BACKEND_REPO_TAG:="main"} +echo ${TRITON_BACKEND_REPO_TAG} +git clone --single-branch --depth=1 -b ${TRITON_BACKEND_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/backend.git +cd .. + +exec mkdocs serve -f $CONFIG > $LOG & +PID=$! +# Time for the compilation to finish. This needs to be increased if other repos +# are added to the test +sleep 20 + +until [[ (-z `pgrep mkdocs`) ]]; do + kill -2 $PID + sleep 2 +done + +if [[ ! -z `grep "invalid url" $LOG` ]]; then + cat $LOG + RET=1 +fi + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test PASSED\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +# exit $RET diff --git a/qa/L0_docs/test.sh b/qa/L0_docs/test.sh deleted file mode 100755 index a7f9b3ae38..0000000000 --- a/qa/L0_docs/test.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -TEST_LOG="./docs.log" - -rm -f $TEST_LOG -RET=0 - -apt-get update && \ - apt-get install -y --no-install-recommends doxygen && \ - pip install --upgrade sphinx sphinx-rtd-theme nbsphinx exhale && \ - pip install --upgrade /opt/tensorrtserver/pip/tensorrtserver-*.whl - -set +e - -(cd /workspace/docs && \ - make BUILDDIR=/opt/tensorrtserver/qa/L0_docs/build clean html) > $TEST_LOG 2>&1 -if [ $? -ne 0 ]; then - RET=1 -fi - -set -e - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - cat $TEST_LOG - echo -e "\n***\n*** Test FAILED\n***" -fi - -exit $RET diff --git a/qa/L0_dyna_implicit_state/test.sh b/qa/L0_dyna_implicit_state/test.sh new file mode 100755 index 0000000000..0721d5cd32 --- /dev/null +++ b/qa/L0_dyna_implicit_state/test.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export ENSEMBLES=0 +BACKENDS=${BACKENDS:="onnx plan"} +export BACKENDS +export IMPLICIT_STATE=1 + +(cd ../L0_dyna_sequence_batcher/ && bash -ex test.sh $REPO_VERSION) +RET=$? + +if [ $RET == 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py b/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py new file mode 100755 index 0000000000..f2c709469b --- /dev/null +++ b/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py @@ -0,0 +1,1299 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
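The dyna-sequence test file that follows derives expected results from an accumulator model: for the trials that its `get_expected_result` adjustment applies to, the result collapses to the last value in the sequence, plus 1 when "start" is in the flag string and plus the correlation ID when "end" is; other trials keep the full running sum the test passes in, and the implicit-state variant returns the passed value unchanged. A hedged worked example for test_simple_sequence (values 1..9, correlation ID 52, flag string "end"):

```python
# Worked example mirroring get_expected_result / get_expected_result_implicit.
values, corrid = list(range(1, 10)), 52

full_accumulator = sum(values) + corrid  # 45 + 52 = 97, the value the test passes in
adjusted = values[-1] + corrid           # 9 + 52 = 61, for trials where only the last
                                         # value is reported (+1 more if "start" were
                                         # also in the flag string)
implicit_state = sum(values)             # 45, used when IMPLICIT_STATE=1
```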
+ +import sys + +sys.path.append("../common") + +import os +import threading +import time +import unittest +from builtins import str + +import numpy as np +import sequence_util as su +import test_util as tu + +_test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +_test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) + +NO_BATCHING = int(os.environ.get("NO_BATCHING", 0)) == 1 +BACKENDS = os.environ.get( + "BACKENDS", "graphdef savedmodel libtorch onnx plan custom custom_string" +) +IMPLICIT_STATE = int(os.environ["IMPLICIT_STATE"]) == 1 + +_trials = BACKENDS.split(" ") +for backend in BACKENDS.split(" "): + if NO_BATCHING: + if (backend != "custom") and (backend != "custom_string"): + _trials += (backend + "_nobatch",) + +_ragged_batch_supported_trials = [] +if "custom" in BACKENDS.split(" "): + _ragged_batch_supported_trials.append("custom") + +_protocols = ("http", "grpc") +_max_sequence_idle_ms = 5000 + + +class DynaSequenceBatcherTest(su.SequenceBatcherTestUtil): + def get_datatype(self, trial): + return np.int32 + + def get_expected_result(self, expected_result, corrid, value, trial, flag_str=None): + # Adjust the expected_result for models that + # could not implement the full accumulator. See + # qa/common/gen_qa_dyna_sequence_models.py for more + # information. + if ( + (("nobatch" not in trial) and ("custom" not in trial)) + or ("graphdef" in trial) + or ("plan" in trial) + or ("onnx" in trial) + or ("libtorch" in trial) + ): + expected_result = value + if flag_str is not None: + if "start" in flag_str: + expected_result += 1 + if "end" in flag_str: + if isinstance(corrid, str): + expected_result += int(corrid) + else: + expected_result += corrid + return expected_result + + def get_expected_result_implicit( + self, expected_result, corrid, value, trial, flag_str=None + ): + return expected_result + + def test_simple_sequence(self): + # Send one sequence and check for correct accumulator + # result. The result should be returned immediately. + for trial in _trials: + # Run on different protocols. + for idx, protocol in enumerate(_protocols): + self.clear_deferred_exceptions() + try: + dtype = self.get_datatype(trial) + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + if "string" in trial: + corrid = "52" + else: + corrid = 52 + + expected_result = ( + self.get_expected_result( + 45 + int(corrid), corrid, 9, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 45, corrid, 9, trial, "end" + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + corrid, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + ( + ("start", 1, None, None), + (None, 2, None, None), + (None, 3, None, None), + (None, 4, None, None), + (None, 5, None, None), + (None, 6, None, None), + (None, 7, None, None), + (None, 8, None, None), + ("end", 9, None, None), + ), + expected_result, + protocol, + sequence_name="{}_{}".format(self._testMethodName, protocol), + ) + + self.check_deferred_exception() + self.check_status( + model_name, {1: 9 * (idx + 1)}, 9 * (idx + 1), 9 * (idx + 1) + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_length1_sequence(self): + # Send a length-1 sequence and check for correct accumulator + # result. 
The result should be returned immediately. + for trial in _trials: + # Run on different protocols. + for idx, protocol in enumerate(_protocols): + self.clear_deferred_exceptions() + try: + dtype = self.get_datatype(trial) + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + if "string" in trial: + corrid = "99" + else: + corrid = 99 + + expected_result = ( + self.get_expected_result( + 42 + int(corrid), corrid, 42, trial, "start,end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 42, corrid, 42, trial, "start,end" + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + corrid, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + (("start,end", 42, None, None),), + expected_result, + protocol, + sequence_name="{}_{}".format(self._testMethodName, protocol), + ) + + self.check_deferred_exception() + self.check_status(model_name, {1: (idx + 1)}, (idx + 1), (idx + 1)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def _multi_sequence_impl( + self, trials, expected_batch_exec, expected_exec_cnt, sleep_secs, tensor_shapes + ): + for trial in trials: + self.clear_deferred_exceptions() + dtype = self.get_datatype(trial) + precreated_shm0_handles = self.precreate_register_regions( + (1, 3), dtype, 0, tensor_shape=(tensor_shapes[0],) + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1, tensor_shape=(tensor_shapes[1],) + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2, tensor_shape=(tensor_shapes[2],) + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3, tensor_shape=(tensor_shapes[3],) + ) + try: + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + if "string" in trial: + corrids = ["1001", "1002", "1003", "1004"] + else: + corrids = [1001, 1002, 1003, 1004] + + expected_result = ( + self.get_expected_result( + 4 * tensor_shapes[0] + int(corrids[0]), + corrids[0], + 3, + trial, + "end", + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4, corrids[0], 3, trial, "end" + ) + ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "tensor_shape": (tensor_shapes[0],), + }, + ) + ) + + expected_result = ( + self.get_expected_result( + 36 * tensor_shapes[1] + int(corrids[1]), + corrids[1], + 13, + trial, + "end", + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36, corrids[1], 13, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11, None), (None, 12, None), ("end", 13, None)), + expected_result, + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, 
corrids[1] + ), + "tensor_shape": (tensor_shapes[1],), + }, + ) + ) + + expected_result = ( + self.get_expected_result( + 336 * tensor_shapes[2] + int(corrids[2]), + corrids[2], + 113, + trial, + "end", + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, corrids[2], 113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "tensor_shape": (tensor_shapes[2],), + }, + ) + ) + expected_result = ( + self.get_expected_result( + 3336 * tensor_shapes[3] + int(corrids[3]), + corrids[3], + 1113, + trial, + "end", + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, corrids[3], 1113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "tensor_shape": (tensor_shapes[3],), + }, + ) + ) + + for t in threads: + t.start() + if sleep_secs > 0: + time.sleep(sleep_secs) + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status( + model_name, expected_batch_exec, expected_exec_cnt, 11 + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if _test_system_shared_memory or _test_cuda_shared_memory: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_multi_sequence(self): + # Send four sequences in series and make sure they get + # batched correctly. + self._multi_sequence_impl(_trials, {4: 2, 3: 1}, 3, 1, (1, 1, 1, 1)) + + def test_multi_parallel_sequence(self): + # Send four sequences in parallel and make sure they get + # batched correctly. + self._multi_sequence_impl(_trials, {4: 2, 3: 1}, 3, 0, (1, 1, 1, 1)) + + def test_multi_sequence_different_shape(self): + # Send four sequences in parallel where the requests in each + # sequence have different shape. Sequences should not be + # batched due to input tensor size differences. + self._multi_sequence_impl( + _ragged_batch_supported_trials, {1: 11}, 11, 0, (4, 3, 1, 2) + ) + + def test_multi_sequence_different_shape_allow_ragged(self): + # Send four sequences in parallel where the requests in each + # sequence have different shape. Input is marked as allowing + # ragged and so sequences should be batched even with input + # tensor size differences. + self._multi_sequence_impl( + _ragged_batch_supported_trials, {4: 2, 3: 1}, 3, 1, (4, 3, 1, 2) + ) + + def test_backlog(self): + # Send 5 equal-length sequences in parallel and make sure they + # get completely batched into batch-size 4 inferences plus the + # 5th should go in the backlog and then get handled once there + # is a free slot. 
+ for trial in _trials: + self.clear_deferred_exceptions() + dtype = self.get_datatype(trial) + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111, 11112, 11113), dtype, 4 + ) + try: + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + if "string" in trial: + corrids = ["1001", "1002", "1003", "1004", "1005"] + else: + corrids = [1001, 1002, 1003, 1004, 1005] + + expected_result = ( + self.get_expected_result( + 6 + int(corrids[0]), corrids[0], 3, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, corrids[0], 3, trial, "end" + ) + ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result( + 36 + int(corrids[1]), corrids[1], 13, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36, corrids[1], 13, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11, None), (None, 12, None), ("end", 13, None)), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result( + 336 + int(corrids[2]), corrids[2], 113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, corrids[2], 113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result( + 3336 + int(corrids[3]), corrids[3], 1113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, corrids[3], 1113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result( + 33336 + int(corrids[4]), corrids[4], 11113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 33336, corrids[4], 11113, trial, "end" + ) + ) + 
threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[4], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11111, None), + (None, 11112, None), + ("end", 11113, None), + ), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 3, 1: 3}, 6, 15) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if _test_system_shared_memory or _test_cuda_shared_memory: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + + def test_backlog_fill(self): + # Send 4 sequences in parallel, two of which are shorter. Send + # 2 additional sequences that should go into backlog but + # should immediately fill into the short sequences. + for trial in _trials: + self.clear_deferred_exceptions() + dtype = self.get_datatype(trial) + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111,), dtype, 4 + ) + precreated_shm5_handles = self.precreate_register_regions( + (22222,), dtype, 5 + ) + try: + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + if "string" in trial: + corrids = ["1001", "1002", "1003", "1004", "1005", "1006"] + else: + corrids = [1001, 1002, 1003, 1004, 1005, 1006] + threads = [] + + expected_result = ( + self.get_expected_result( + 6 + int(corrids[0]), corrids[0], 3, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, corrids[0], 3, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 24 + int(corrids[1]), corrids[1], 13, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 24, corrids[1], 13, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11, None), ("end", 13, None)), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 224 + int(corrids[2]), corrids[2], 113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 224, corrids[2], 113, trial, "end" + ) + ) + threads.append( 
+ threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 111, None), ("end", 113, None)), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 3336 + int(corrids[3]), corrids[3], 1113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, corrids[3], 1113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, 3000), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 11111 + int(corrids[4]), corrids[4], 11111, trial, "start,end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 11111, corrids[4], 11111, trial, "start,end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[4], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start,end", 11111, None),), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 22222 + int(corrids[5]), corrids[5], 22222, trial, "start,end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 22222, corrids[5], 22222, trial, "start,end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[5], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start,end", 22222, None),), + expected_result, + precreated_shm5_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + threads[3].start() + time.sleep(2) + threads[4].start() + threads[5].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 3}, 3, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if _test_system_shared_memory or _test_cuda_shared_memory: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + self.cleanup_shm_regions(precreated_shm5_handles) + + def test_backlog_fill_no_end(self): + # Send 4 sequences in parallel, two of which are shorter. Send + # 2 additional sequences that should go into backlog but + # should immediately fill into the short sequences. One of + # those sequences is filled before it gets its end request. 
+ for trial in _trials: + self.clear_deferred_exceptions() + dtype = self.get_datatype(trial) + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111,), dtype, 4 + ) + precreated_shm5_handles = self.precreate_register_regions( + (22222, 22223, 22224), dtype, 5 + ) + try: + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + if "string" in trial: + corrids = ["1001", "1002", "1003", "1004", "1005", "1006"] + else: + corrids = [1001, 1002, 1003, 1004, 1005, 1006] + threads = [] + expected_result = ( + self.get_expected_result( + 6 + int(corrids[0]), corrids[0], 3, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, corrids[0], 3, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 24 + int(corrids[1]), corrids[1], 13, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 24, corrids[1], 13, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11, None), ("end", 13, None)), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 224 + int(corrids[2]), corrids[2], 113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 224, corrids[2], 113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 111, None), ("end", 113, None)), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 3336 + int(corrids[3]), corrids[3], 1113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, corrids[3], 1113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, 3000), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 11111 + int(corrids[4]), corrids[4], 11111, trial, "start,end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 11111, corrids[4], 11111, 
trial, "start,end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[4], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start,end", 11111, None),), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 66669 + int(corrids[5]), corrids[5], 22224, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 66669, corrids[5], 22224, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[5], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 22222, None), + (None, 22223, None), + ("end", 22224, 2000), + ), + expected_result, + precreated_shm5_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + threads[3].start() + time.sleep(2) + threads[4].start() + threads[5].start() + for t in threads: + t.join() + self.check_deferred_exception() + # Expecting the requests of the same sequence to be in the same + # slot, so the execution for thelast long sequence will be + # padded to a batch. + self.check_status(model_name, {4: 3, 1: 2}, 5, 14) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if _test_system_shared_memory or _test_cuda_shared_memory: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + self.cleanup_shm_regions(precreated_shm5_handles) + + def test_backlog_sequence_timeout(self): + # Send 4 sequences in parallel and make sure they get + # completely batched into batch-size 4 inferences. One of the + # sequences has a long delay that causes it to timeout and + # that allows a 5th sequence to come out of the backlog and + # finish. The timed-out sequence will then send the delayed + # inference but it will appear as a new sequence and so fail + # because it doesn't have the START flag. 
+ for trial in _trials: + self.clear_deferred_exceptions() + dtype = self.get_datatype(trial) + precreated_shm0_handles = self.precreate_register_regions((1, 3), dtype, 0) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111, 11113), dtype, 4 + ) + try: + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + if "string" in trial: + corrids = ["1001", "1002", "1003", "1004", "1005"] + else: + corrids = [1001, 1002, 1003, 1004, 1005] + threads = [] + expected_result = ( + self.get_expected_result( + 4 + int(corrids[0]), corrids[0], 3, trial, None + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4, corrids[0], 3, trial, None + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1, None), + (None, 3, _max_sequence_idle_ms + 1000), + ), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 48 + int(corrids[1]), corrids[1], 13, trial, None + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 48, corrids[1], 13, trial, None + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, _max_sequence_idle_ms / 2), + (None, 12, _max_sequence_idle_ms / 2), + ("end", 13, _max_sequence_idle_ms / 2), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 448 + int(corrids[2]), corrids[2], 113, trial, None + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 448, corrids[2], 113, trial, None + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, _max_sequence_idle_ms / 2), + (None, 112, _max_sequence_idle_ms / 2), + ("end", 113, _max_sequence_idle_ms / 2), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 4448 + int(corrids[3]), corrids[3], 1113, trial, None + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4448, corrids[3], 1113, trial, None + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, _max_sequence_idle_ms / 2), + (None, 1112, _max_sequence_idle_ms / 2), + ("end", 1113, _max_sequence_idle_ms / 2), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": 
"{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result( + 22224 + int(corrids[4]), corrids[4], 11113, trial, "end" + ) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 22224, corrids[4], 11113, trial, "end" + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[4], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11111, None), ("end", 11113, None)), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + threads[3].start() + time.sleep(2) + threads[4].start() + for t in threads: + t.join() + + self.check_deferred_exception() + self.assertTrue(False, "expected error") + except Exception as ex: + self.assertTrue( + ex.message().startswith( + str( + "inference request for sequence 1001 to " + + "model '{}' must specify the START flag on the first " + + "request of the sequence" + ).format(model_name) + ) + ) + finally: + if _test_system_shared_memory or _test_cuda_shared_memory: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_dyna_sequence_batcher/test.sh b/qa/L0_dyna_sequence_batcher/test.sh new file mode 100755 index 0000000000..acac8399af --- /dev/null +++ b/qa/L0_dyna_sequence_batcher/test.sh @@ -0,0 +1,230 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +CLIENT_LOG="./client.log" +BATCHER_TEST=dyna_sequence_batcher_test.py + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +export CUDA_VISIBLE_DEVICES=0 + +# If IMPLICIT_STATE not specified, set to 0 +IMPLICIT_STATE=${IMPLICIT_STATE:="0"} +export IMPLICIT_STATE + +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel libtorch onnx plan custom custom_string"} +export BACKENDS + +MODEL_REPOSITORY='' +if [ "$IMPLICIT_STATE" == "1" ]; then + MODEL_REPOSITORY="qa_dyna_sequence_implicit_model_repository" +else + MODEL_REPOSITORY="qa_dyna_sequence_model_repository" +fi + +RET=0 + +rm -fr *.log + +# models +rm -fr models && mkdir models +for MODEL in ${DATADIR}/$MODEL_REPOSITORY/* ; do + cp -r $MODEL models/. && \ + (cd models/$(basename $MODEL) && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt) +done + +# Implicit state models for custom backend do not exist. +if [ $IMPLICIT_STATE == "0" ]; then + cp -r ../custom_models/custom_dyna_sequence_int32 models/. + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" models/custom_dyna_sequence_int32/config.pbtxt + # Construct custom dyna_sequence_model with STRING sequence ID. Copy model and edit config.pbtxt + cp -r models/custom_dyna_sequence_int32 models/custom_string_dyna_sequence_int32 + sed -i "s/custom_dyna_sequence_int32/custom_string_dyna_sequence_int32/g" models/custom_string_dyna_sequence_int32/config.pbtxt + sed -i "/CONTROL_SEQUENCE_CORRID/{n;s/data_type:.*/data_type: TYPE_STRING/}" models/custom_string_dyna_sequence_int32/config.pbtxt +fi + +# Implicit state models that support ragged batching do not exist. +if [ $IMPLICIT_STATE == "0" ]; then + # ragged models + rm -fr ragged_models && mkdir ragged_models + cp -r ../custom_models/custom_dyna_sequence_int32 ragged_models/. + (cd ragged_models/custom_dyna_sequence_int32 && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/name:.*\"INPUT\"/name: \"INPUT\"\\nallow_ragged_batch: true/" config.pbtxt) +fi + +# Need to launch the server for each test so that the model status is +# reset (which is used to make sure the correct batch size was used +# for execution). Test everything with fixed-tensor-size models and +# variable-tensor-size models. +export NO_BATCHING=1 +for i in \ + test_simple_sequence \ + test_length1_sequence \ + ; do + SERVER_LOG="./$i.server.log" + SERVER_ARGS="--model-repository=`pwd`/models" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $BATCHER_TEST DynaSequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Tests that require max_queue_delay_microseconds to be non-zero so +# that batching is delayed until a full preferred batch is available. 
+for m in `ls models`; do + (cd models/$m && \ + sed -i "s/max_candidate_sequences:.*/max_candidate_sequences:4/" config.pbtxt && \ + sed -i "s/max_queue_delay_microseconds:.*/max_queue_delay_microseconds:5000000/" config.pbtxt) +done + +export NO_BATCHING=0 +for i in \ + test_multi_sequence_different_shape \ + test_multi_sequence \ + test_multi_parallel_sequence \ + test_backlog \ + test_backlog_fill \ + test_backlog_fill_no_end \ + test_backlog_sequence_timeout \ + ; do + + SERVER_LOG="./$i.server.log" + SERVER_ARGS="--model-repository=`pwd`/models" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $BATCHER_TEST DynaSequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $IMPLICIT_STATE == "0" ]; then + # Ragged-batch tests that require max_queue_delay_microseconds to be + # non-zero so that batching is delayed until a full preferred batch is + # available. + for m in `ls ragged_models`; do + (cd ragged_models/$m && \ + sed -i "s/max_candidate_sequences:.*/max_candidate_sequences:4/" config.pbtxt && \ + sed -i "s/max_queue_delay_microseconds:.*/max_queue_delay_microseconds:5000000/" config.pbtxt) + done + + export NO_BATCHING=0 + for i in \ + test_multi_sequence_different_shape_allow_ragged \ + ; do + + SERVER_LOG="./$i.server.log" + SERVER_ARGS="--model-repository=`pwd`/ragged_models" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $BATCHER_TEST DynaSequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + done +fi + +# python unittest seems to swallow ImportError and still return 0 exit +# code. So need to explicitly check CLIENT_LOG to make sure we see +# some running tests +grep -c "HTTPSocketPoolResponse status=200" $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed To Run\n***" + RET=1 +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py b/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py new file mode 100644 index 0000000000..17c406b18e --- /dev/null +++ b/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py @@ -0,0 +1,63 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + + for request in requests: + json_string = ( + pb_utils.get_input_tensor_by_name(request, "EXPECTED_HEADERS") + .as_numpy()[0] + .decode("utf-8") + ) + expected_headers = json.loads(json_string) + + success = True + if request.parameters() != "": + parameters = json.loads(request.parameters()) + for key, value in expected_headers.items(): + if key in parameters: + if parameters[key] != value: + success = False + else: + success = False + + test_success = pb_utils.Tensor( + "TEST_SUCCESS", np.array([success], dtype=bool) + ) + inference_response = pb_utils.InferenceResponse( + output_tensors=[test_success] + ) + responses.append(inference_response) + + return responses diff --git a/qa/L0_grpc/client_plugin_models/client_plugin_test/config.pbtxt b/qa/L0_grpc/client_plugin_models/client_plugin_test/config.pbtxt new file mode 100644 index 0000000000..1bf368f795 --- /dev/null +++ b/qa/L0_grpc/client_plugin_models/client_plugin_test/config.pbtxt @@ -0,0 +1,45 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "client_plugin_test" +backend: "python" + +input [ + { + name: "EXPECTED_HEADERS" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "TEST_SUCCESS" + data_type: TYPE_BOOL + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_grpc/grpc_basic_auth_test.py b/qa/L0_grpc/grpc_basic_auth_test.py new file mode 100755 index 0000000000..07d29ef5b7 --- /dev/null +++ b/qa/L0_grpc/grpc_basic_auth_test.py @@ -0,0 +1,66 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
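The basic-auth test below points the gRPC client at an nginx reverse proxy on port 8004 (configured later in this diff) that enforces HTTP basic authentication, and registers tritonclient's BasicAuth plugin so every call carries the credentials. Conceptually the plugin just attaches the standard Authorization header; a minimal sketch of that value for the test's username/password:

# Sketch: the header a basic-auth plugin effectively adds to each request.
import base64

token = base64.b64encode(b"username:password").decode()
headers = {"authorization": f"Basic {token}"}  # "Basic dXNlcm5hbWU6cGFzc3dvcmQ="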
+import sys +import unittest + +sys.path.append("../common") + +import test_util as tu +import tritonclient.grpc as tritongrpcclient +import tritonclient.grpc.aio as asynctritongrpcclient +from tritonclient.grpc.aio.auth import BasicAuth as AsyncBasicAuth +from tritonclient.grpc.auth import BasicAuth + + +class GRPCBasicAuthTest(tu.TestResultCollector): + def setUp(self): + # Use the nginx port + self._client = tritongrpcclient.InferenceServerClient(url="localhost:8004") + self._client.register_plugin(BasicAuth("username", "password")) + + def test_client_call(self): + self.assertTrue(self._client.is_server_live()) + + def tearDown(self): + self._client.close() + + +class GRPCBasicAuthAsyncTest(unittest.IsolatedAsyncioTestCase): + async def asyncSetUp(self): + # Use the nginx port + self._client = asynctritongrpcclient.InferenceServerClient(url="localhost:8004") + self._client.register_plugin(AsyncBasicAuth("username", "password")) + + async def test_client_call(self): + self.assertTrue(await self._client.is_server_live()) + + async def asyncTearDown(self): + await self._client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_grpc/grpc_client_plugin_test.py b/qa/L0_grpc/grpc_client_plugin_test.py new file mode 100755 index 0000000000..1cc8c474ef --- /dev/null +++ b/qa/L0_grpc/grpc_client_plugin_test.py @@ -0,0 +1,120 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as tritongrpcclient +import tritonclient.grpc.aio as asynctritongrpcclient +from tritonclient.grpc import InferenceServerClientPlugin +from tritonclient.utils import np_to_triton_dtype + + +# A simple plugin that adds headers to the inference request. 
+class TestPlugin(InferenceServerClientPlugin): + def __init__(self, headers): + self._headers = headers + + def __call__(self, request): + request.headers.update(self._headers) + + +def prepare_infer_inputs(headers): + expected_headers = np.array([json.dumps(headers)], dtype=object) + inputs = [] + inputs.append( + tritongrpcclient.InferInput( + "EXPECTED_HEADERS", + expected_headers.shape, + np_to_triton_dtype(expected_headers.dtype), + ) + ) + inputs[0].set_data_from_numpy(expected_headers) + + return inputs + + +class GRPCClientPluginAsyncTest(unittest.IsolatedAsyncioTestCase): + async def asyncSetUp(self): + self._headers = {"my-key": "my-value"} + self._plugin = TestPlugin(self._headers) + self._client = asynctritongrpcclient.InferenceServerClient(url="localhost:8001") + + async def test_simple_infer(self): + model = "client_plugin_test" + inputs = prepare_infer_inputs(self._headers) + self._client.register_plugin(self._plugin) + response = await self._client.infer(model_name=model, inputs=inputs) + test_success = response.as_numpy("TEST_SUCCESS") + self.assertEqual(test_success, True) + + self._client.unregister_plugin() + inputs = prepare_infer_inputs({}) + response = await self._client.infer(model_name=model, inputs=inputs) + test_success = response.as_numpy("TEST_SUCCESS") + self.assertEqual(test_success, True) + + async def asyncTearDown(self): + await self._client.close() + + +class GRPCClientPluginTest(tu.TestResultCollector): + def setUp(self): + self._headers = {"my-key": "my-value"} + self._plugin = TestPlugin(self._headers) + self._client = tritongrpcclient.InferenceServerClient(url="localhost:8001") + + def test_simple_infer(self): + # Set the binary data to False so that 'Inference-Header-Length' is not + # added to the headers. + model = "client_plugin_test" + inputs = prepare_infer_inputs(self._headers) + self._client.register_plugin(self._plugin) + self.assertEqual(self._plugin, self._client.plugin()) + response = self._client.infer(model_name=model, inputs=inputs) + test_success = response.as_numpy("TEST_SUCCESS") + self.assertEqual(test_success, True) + + # Unregister the plugin + inputs = prepare_infer_inputs({}) + self._client.unregister_plugin() + self.assertEqual(None, self._client.plugin()) + response = self._client.infer(model_name=model, inputs=inputs) + test_success = response.as_numpy("TEST_SUCCESS") + self.assertEqual(test_success, True) + + def tearDown(self): + self._client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_grpc/nginx.conf b/qa/L0_grpc/nginx.conf new file mode 100644 index 0000000000..063d358c21 --- /dev/null +++ b/qa/L0_grpc/nginx.conf @@ -0,0 +1,54 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +worker_processes 1; + +error_log /var/log/nginx/error.log; + +events { + worker_connections 1024; +} + +http { + # Configure basic authentication + auth_basic "Restricted Content"; + auth_basic_user_file /opt/tritonserver/qa/L0_grpc/pswd; + + # Define upstream server + upstream backend { + server localhost:8001; + } + + # Define server block for reverse proxy + server { + listen 8004 http2; + + # Configure location for reverse proxy + location / { + grpc_pass grpc://backend; + } + } +} diff --git a/qa/L0_grpc/python_grpc_aio_test.py b/qa/L0_grpc/python_grpc_aio_test.py new file mode 100755 index 0000000000..ba43b36abb --- /dev/null +++ b/qa/L0_grpc/python_grpc_aio_test.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
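python_grpc_aio_test.py below drives the asyncio flavor of the gRPC client from unittest.IsolatedAsyncioTestCase. The same client can also be used standalone with asyncio.run; a minimal sketch using only calls that appear in the test (assumes a Triton server on localhost:8001):

# Sketch: standalone use of the aio client outside a test harness.
import asyncio

import tritonclient.grpc.aio as grpcclient


async def main():
    client = grpcclient.InferenceServerClient(url="localhost:8001")
    try:
        print(await client.is_server_live())              # True when the server is up
        print((await client.get_server_metadata()).name)  # "triton"
    finally:
        await client.close()


asyncio.run(main())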
+ +import unittest + +import tritonclient.grpc.aio as grpcclient +from tritonclient.utils import * + + +class TestGrpcAioClient(unittest.IsolatedAsyncioTestCase): + """Test if aio rpc can reach the server""" + + def setUp(self): + self._triton_client = grpcclient.InferenceServerClient(url="localhost:8001") + + async def asyncTearDown(self): + await self._triton_client.close() + + async def test_is_server_live(self): + ret = await self._triton_client.is_server_live() + self.assertEqual(ret, True) + + async def test_is_server_ready(self): + ret = await self._triton_client.is_server_ready() + self.assertEqual(ret, True) + + async def test_is_model_ready(self): + ret = await self._triton_client.is_model_ready("simple") + self.assertEqual(ret, True) + + async def test_get_server_metadata(self): + ret = await self._triton_client.get_server_metadata() + self.assertEqual(ret.name, "triton") + + ret = await self._triton_client.get_server_metadata(as_json=True) + self.assertEqual(ret["name"], "triton") + + async def test_get_model_metadata(self): + ret = await self._triton_client.get_model_metadata("simple") + self.assertEqual(ret.name, "simple") + + async def test_get_model_config(self): + ret = await self._triton_client.get_model_config("simple") + self.assertEqual(ret.config.name, "simple") + + async def test_get_model_repository_index(self): + ret = await self._triton_client.get_model_repository_index() + self.assertEqual(len(ret.models), 8) + + async def test_load_model(self): + with self.assertRaisesRegex( + InferenceServerException, + "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled", + ): + await self._triton_client.load_model("simple") + + async def test_unload_model(self): + with self.assertRaisesRegex( + InferenceServerException, + "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled", + ): + await self._triton_client.load_model("simple") + + async def test_get_inference_statistics(self): + await self._triton_client.get_inference_statistics() + + async def test_update_trace_settings(self): + await self._triton_client.update_trace_settings() + + async def test_get_trace_settings(self): + await self._triton_client.get_trace_settings() + + async def test_get_system_shared_memory_status(self): + await self._triton_client.get_system_shared_memory_status() + + async def test_register_system_shared_memory(self): + with self.assertRaisesRegex( + InferenceServerException, + "\[StatusCode\.INTERNAL\] Unable to open shared memory region: ''", + ): + await self._triton_client.register_system_shared_memory("", "", 0) + + async def test_unregister_system_shared_memory(self): + await self._triton_client.unregister_system_shared_memory() + + async def test_get_cuda_shared_memory_status(self): + await self._triton_client.get_cuda_shared_memory_status() + + async def test_register_cuda_shared_memory(self): + with self.assertRaisesRegex( + InferenceServerException, + "failed to register shared memory region.*invalid args", + ): + await self._triton_client.register_cuda_shared_memory("", b"", 0, 0) + + async def test_unregister_cuda_shared_memory(self): + await self._triton_client.unregister_cuda_shared_memory() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_grpc/python_unit_test.py b/qa/L0_grpc/python_unit_test.py new file mode 100755 index 0000000000..9591d4274c --- /dev/null +++ b/qa/L0_grpc/python_unit_test.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# Copyright 2023, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import queue +import time +import unittest + +# For stream infer test +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class RestrictedProtocolTest(unittest.TestCase): + def setUp(self): + self.client_ = grpcclient.InferenceServerClient(url="localhost:8001") + self.model_name_ = "simple" + self.prefix_ = "triton-grpc-protocol-" + + # Other unspecified protocols should not be restricted + def test_sanity(self): + self.client_.get_inference_statistics("simple") + self.client_.get_inference_statistics( + "simple", headers={self.prefix_ + "infer-key": "infer-value"} + ) + + # health, infer, model repository protocols are restricted. + # health and infer expects "triton-grpc-restricted-infer-key : infer-value" header, + # model repository expected "triton-grpc-restricted-admin-key : admin-value". 
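Concretely, the header keys in the tests that follow are built from self.prefix_ = "triton-grpc-protocol-" defined in setUp, so the metadata actually sent is triton-grpc-protocol-infer-key: infer-value (and triton-grpc-protocol-admin-key: admin-value for model repository calls). A minimal sketch of such a call, using the same client type and method as the tests below:

# Sketch: calling a restricted protocol with the configured key/value.
client = grpcclient.InferenceServerClient(url="localhost:8001")
client.is_server_live(headers={"triton-grpc-protocol-infer-key": "infer-value"})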
+ def test_model_repository(self): + with self.assertRaisesRegex( + InferenceServerException, "This protocol is restricted" + ): + self.client_.unload_model( + self.model_name_, headers={self.prefix_ + "infer-key": "infer-value"} + ) + # Request go through and get actual transaction error + with self.assertRaisesRegex( + InferenceServerException, "explicit model load / unload is not allowed" + ): + self.client_.unload_model( + self.model_name_, headers={self.prefix_ + "admin-key": "admin-value"} + ) + + def test_health(self): + with self.assertRaisesRegex( + InferenceServerException, "This protocol is restricted" + ): + self.client_.is_server_live() + self.client_.is_server_live({self.prefix_ + "infer-key": "infer-value"}) + + def test_infer(self): + # setup + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32)) + inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32)) + + # This test only care if the request goes through + with self.assertRaisesRegex( + InferenceServerException, "This protocol is restricted" + ): + _ = self.client_.infer( + model_name=self.model_name_, inputs=inputs, headers={"test": "1"} + ) + self.client_.infer( + model_name=self.model_name_, + inputs=inputs, + headers={self.prefix_ + "infer-key": "infer-value"}, + ) + + def test_stream_infer(self): + # setup + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32)) + inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32)) + user_data = UserData() + # The server can't interfere with whether GRPC should create the stream, + # server will be notified after the stream is established and only + # until then be able to access metadata to decide whether to continue + # the stream. + # So on client side, it will always perceive that the stream is + # successfully created and can only check its health at a later time. + self.client_.start_stream(partial(callback, user_data), headers={"test": "1"}) + # wait for sufficient round-trip time + time.sleep(1) + with self.assertRaisesRegex( + InferenceServerException, "The stream is no longer in valid state" + ): + self.client_.async_stream_infer(model_name=self.model_name_, inputs=inputs) + # callback should record error detail + self.assertFalse(user_data._completed_requests.empty()) + with self.assertRaisesRegex( + InferenceServerException, "This protocol is restricted" + ): + raise user_data._completed_requests.get() + + self.assertTrue(user_data._completed_requests.empty()) + + # Stop and start new stream with proper header + self.client_.stop_stream() + self.client_.start_stream( + partial(callback, user_data), + headers={self.prefix_ + "infer-key": "infer-value"}, + ) + self.client_.async_stream_infer(model_name=self.model_name_, inputs=inputs) + # wait for response + time.sleep(1) + self.assertFalse(user_data._completed_requests.empty()) + self.assertNotEqual( + type(user_data._completed_requests.get()), InferenceServerException + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh new file mode 100755 index 0000000000..93d22e75be --- /dev/null +++ b/qa/L0_grpc/test.sh @@ -0,0 +1,687 @@ +#!/bin/bash +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +CLIENT_PLUGIN_TEST="./grpc_client_plugin_test.py" +BASIC_AUTH_TEST="./grpc_basic_auth_test.py" +NGINX_CONF="./nginx.conf" +# On windows the paths invoked by the script (running in WSL) must use +# /mnt/c when needed but the paths on the tritonserver command-line +# must be C:/ style. 
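+# For example, the server binary is invoked through its WSL path
+# (/mnt/c/tritonserver/bin/tritonserver.exe), while MODELDIR and BACKEND_DIR
+# passed on its command line remain C:/ style, as set below.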
+if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then + SDKDIR=${SDKDIR:=C:/sdk} + MODELDIR=${MODELDIR:=C:/models} + CLIENT_PLUGIN_MODELDIR=${MODELDIR:=C:/client_plugin_models} + DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} + BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} + SERVER=${SERVER:=/mnt/c/tritonserver/bin/tritonserver.exe} + + SIMPLE_AIO_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_aio_infer_client.py + SIMPLE_AIO_STREAM_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_aio_sequence_stream_infer_client.py + SIMPLE_HEALTH_CLIENT_PY=${SDKDIR}/python/simple_grpc_health_metadata.py + SIMPLE_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_infer_client.py + SIMPLE_ASYNC_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_async_infer_client.py + SIMPLE_STRING_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_string_infer_client.py + SIMPLE_STREAM_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_sequence_stream_infer_client.py + SIMPLE_SEQUENCE_INFER_CLIENT_PY=${SDKDIR}/python/simple_grpc_sequence_sync_infer_client.py + SIMPLE_IMAGE_CLIENT_PY=${SDKDIR}/python/image_client.py + # SIMPLE_ENSEMBLE_IMAGE_CLIENT_PY=${SDKDIR}/python/ensemble_image_client.py + SIMPLE_SHM_STRING_CLIENT_PY=${SDKDIR}/python/simple_grpc_shm_string_client.py + SIMPLE_SHM_CLIENT_PY=${SDKDIR}/python/simple_grpc_shm_client.py + SIMPLE_CUDASHM_CLIENT_PY=${SDKDIR}/python/simple_grpc_cudashm_client.py + SIMPLE_MODEL_CONTROL_PY=${SDKDIR}/python/simple_grpc_model_control.py + SIMPLE_REUSE_INFER_OBJECTS_CLIENT_PY=${SDKDIR}/python/reuse_infer_objects_client.py + SIMPLE_KEEPALIVE_CLIENT_PY=${SDKDIR}/python/simple_grpc_keepalive_client.py + SIMPLE_CUSTOM_ARGS_CLIENT_PY=${SDKDIR}/python/simple_grpc_custom_args_client.py + EXPLICIT_BYTE_CONTENT_CLIENT_PY=${SDKDIR}/python/grpc_explicit_byte_content_client.py + EXPLICIT_INT_CONTENT_CLIENT_PY=${SDKDIR}/python/grpc_explicit_int_content_client.py + EXPLICIT_INT8_CONTENT_CLIENT_PY=${SDKDIR}/python/grpc_explicit_int8_content_client.py + GRPC_CLIENT_PY=${SDKDIR}/python/grpc_client.py + GRPC_IMAGE_CLIENT_PY=${SDKDIR}/python/grpc_image_client.py + + SIMPLE_HEALTH_CLIENT=${SDKDIR}/python/simple_grpc_health_metadata + SIMPLE_INFER_CLIENT=${SDKDIR}/python/simple_grpc_infer_client + SIMPLE_STRING_INFER_CLIENT=${SDKDIR}/python/simple_grpc_string_infer_client + SIMPLE_ASYNC_INFER_CLIENT=${SDKDIR}/python/simple_grpc_async_infer_client + SIMPLE_MODEL_CONTROL=${SDKDIR}/python/simple_grpc_model_control + SIMPLE_STREAM_INFER_CLIENT=${SDKDIR}/python/simple_grpc_sequence_stream_infer_client + SIMPLE_SEQUENCE_INFER_CLIENT=${SDKDIR}/python/simple_grpc_sequence_sync_infer_client + SIMPLE_SHM_CLIENT=${SDKDIR}/python/simple_grpc_shm_client + SIMPLE_CUDASHM_CLIENT=${SDKDIR}/python/simple_grpc_cudashm_client + SIMPLE_IMAGE_CLIENT=${SDKDIR}/python/image_client + # SIMPLE_ENSEMBLE_IMAGE_CLIENT=${SDKDIR}/python/ensemble_image_client + SIMPLE_REUSE_INFER_OBJECTS_CLIENT=${SDKDIR}/python/reuse_infer_objects_client + SIMPLE_KEEPALIVE_CLIENT=${SDKDIR}/python/simple_grpc_keepalive_client + SIMPLE_CUSTOM_ARGS_CLIENT=${SDKDIR}/python/simple_grpc_custom_args_client + # [FIXME] point to proper client + CC_UNIT_TEST=${SDKDIR}/python/cc_client_test +else + MODELDIR=${MODELDIR:=`pwd`/models} + CLIENT_PLUGIN_MODELDIR=${CLIENTPLUGINMODELDIR:=`pwd`/client_plugin_models} + DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} + SERVER=${TRITON_DIR}/bin/tritonserver + BACKEND_DIR=${TRITON_DIR}/backends + + 
SIMPLE_AIO_INFER_CLIENT_PY=../clients/simple_grpc_aio_infer_client.py + SIMPLE_AIO_STREAM_INFER_CLIENT_PY=../clients/simple_grpc_aio_sequence_stream_infer_client.py + SIMPLE_HEALTH_CLIENT_PY=../clients/simple_grpc_health_metadata.py + SIMPLE_INFER_CLIENT_PY=../clients/simple_grpc_infer_client.py + SIMPLE_ASYNC_INFER_CLIENT_PY=../clients/simple_grpc_async_infer_client.py + SIMPLE_STRING_INFER_CLIENT_PY=../clients/simple_grpc_string_infer_client.py + SIMPLE_STREAM_INFER_CLIENT_PY=../clients/simple_grpc_sequence_stream_infer_client.py + SIMPLE_SEQUENCE_INFER_CLIENT_PY=../clients/simple_grpc_sequence_sync_infer_client.py + SIMPLE_IMAGE_CLIENT_PY=../clients/image_client.py + # SIMPLE_ENSEMBLE_IMAGE_CLIENT_PY=../clients/ensemble_image_client.py + SIMPLE_SHM_STRING_CLIENT_PY=../clients/simple_grpc_shm_string_client.py + SIMPLE_SHM_CLIENT_PY=../clients/simple_grpc_shm_client.py + SIMPLE_CUDASHM_CLIENT_PY=../clients/simple_grpc_cudashm_client.py + SIMPLE_MODEL_CONTROL_PY=../clients/simple_grpc_model_control.py + SIMPLE_REUSE_INFER_OBJECTS_CLIENT_PY=../clients/reuse_infer_objects_client.py + SIMPLE_KEEPALIVE_CLIENT_PY=../clients/simple_grpc_keepalive_client.py + SIMPLE_CUSTOM_ARGS_CLIENT_PY=../clients/simple_grpc_custom_args_client.py + EXPLICIT_BYTE_CONTENT_CLIENT_PY=../clients/grpc_explicit_byte_content_client.py + EXPLICIT_INT_CONTENT_CLIENT_PY=../clients/grpc_explicit_int_content_client.py + EXPLICIT_INT8_CONTENT_CLIENT_PY=../clients/grpc_explicit_int8_content_client.py + GRPC_CLIENT_PY=../clients/grpc_client.py + GRPC_IMAGE_CLIENT_PY=../clients/grpc_image_client.py + + SIMPLE_HEALTH_CLIENT=../clients/simple_grpc_health_metadata + SIMPLE_INFER_CLIENT=../clients/simple_grpc_infer_client + SIMPLE_STRING_INFER_CLIENT=../clients/simple_grpc_string_infer_client + SIMPLE_ASYNC_INFER_CLIENT=../clients/simple_grpc_async_infer_client + SIMPLE_MODEL_CONTROL=../clients/simple_grpc_model_control + SIMPLE_STREAM_INFER_CLIENT=../clients/simple_grpc_sequence_stream_infer_client + SIMPLE_SEQUENCE_INFER_CLIENT=../clients/simple_grpc_sequence_sync_infer_client + SIMPLE_SHM_CLIENT=../clients/simple_grpc_shm_client + SIMPLE_CUDASHM_CLIENT=../clients/simple_grpc_cudashm_client + SIMPLE_IMAGE_CLIENT=../clients/image_client + # SIMPLE_ENSEMBLE_IMAGE_CLIENT=../clients/ensemble_image_client + SIMPLE_REUSE_INFER_OBJECTS_CLIENT=../clients/reuse_infer_objects_client + SIMPLE_KEEPALIVE_CLIENT=../clients/simple_grpc_keepalive_client + SIMPLE_CUSTOM_ARGS_CLIENT=../clients/simple_grpc_custom_args_client + CC_UNIT_TEST=../clients/cc_client_test +fi +PYTHON_UNIT_TEST=python_unit_test.py + +# Add string_dyna_sequence model to repo +cp -r ${MODELDIR}/simple_dyna_sequence ${MODELDIR}/simple_string_dyna_sequence +sed -i "s/simple_dyna_sequence/simple_string_dyna_sequence/g" ${MODELDIR}/simple_string_dyna_sequence/config.pbtxt +sed -i "s/^platform: .*/backend: \"dyna_sequence\"/g" ${MODELDIR}/simple_string_dyna_sequence/config.pbtxt +sed -i "/CONTROL_SEQUENCE_CORRID/{n;s/data_type:.*/data_type: TYPE_STRING/}" ${MODELDIR}/simple_string_dyna_sequence/config.pbtxt +rm -f ${MODELDIR}/simple_string_dyna_sequence/1/model.graphdef +cp ../custom_models/custom_dyna_sequence_int32/1/libtriton_dyna_sequence.so ${MODELDIR}/simple_string_dyna_sequence/1/ + +rm -f *.log +rm -f *.log.* + +set -e + +CLIENT_LOG=`pwd`/client.log +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}" +source ../common/util.sh + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG 
+ exit 1 +fi + +set +e + +python $SIMPLE_HEALTH_CLIENT_PY -v >> ${CLIENT_LOG}.health 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.health + RET=1 +fi + +IMAGE=../images/vulture.jpeg +for i in \ + $SIMPLE_AIO_INFER_CLIENT_PY \ + $SIMPLE_AIO_STREAM_INFER_CLIENT_PY \ + $SIMPLE_INFER_CLIENT_PY \ + $SIMPLE_ASYNC_INFER_CLIENT_PY \ + $SIMPLE_STRING_INFER_CLIENT_PY \ + $SIMPLE_IMAGE_CLIENT_PY \ + $SIMPLE_ENSEMBLE_IMAGE_CLIENT_PY \ + $SIMPLE_STREAM_INFER_CLIENT_PY \ + $SIMPLE_SEQUENCE_INFER_CLIENT_PY \ + $SIMPLE_SHM_STRING_CLIENT_PY \ + $SIMPLE_SHM_CLIENT_PY \ + $SIMPLE_CUDASHM_CLIENT_PY \ + $SIMPLE_KEEPALIVE_CLIENT_PY \ + $SIMPLE_CUSTOM_ARGS_CLIENT_PY \ + $EXPLICIT_BYTE_CONTENT_CLIENT_PY \ + $EXPLICIT_INT_CONTENT_CLIENT_PY \ + $EXPLICIT_INT8_CONTENT_CLIENT_PY \ + $GRPC_CLIENT_PY \ + $GRPC_IMAGE_CLIENT_PY \ + ; do + BASE=$(basename -- $i) + SUFFIX="${BASE%.*}" + EXTRA_ARGS="" + if [ $SUFFIX == "image_client" ]; then + EXTRA_ARGS="-i grpc -u localhost:8001" + fi + if [[ ($SUFFIX == "image_client") || ($SUFFIX == "grpc_image_client") ]]; then + python $i -m inception_graphdef -s INCEPTION -a -c 1 -b 1 $EXTRA_ARGS $IMAGE >> "${CLIENT_LOG}.async.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.async.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.async.${SUFFIX} + RET=1 + fi + python $i -m inception_graphdef -s INCEPTION -a --streaming -c 1 -b 1 $EXTRA_ARGS $IMAGE >> "${CLIENT_LOG}.streaming.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.streaming.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.streaming.${SUFFIX} + RET=1 + fi + python $i -m inception_graphdef -s INCEPTION -c 1 -b 1 $EXTRA_ARGS $IMAGE >> "${CLIENT_LOG}.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.${SUFFIX} + RET=1 + fi + # elif [ $SUFFIX == "ensemble_image_client" ]; then + # python $i -c 1 $EXTRA_ARGS ../images >> "${CLIENT_LOG}.${SUFFIX}" 2>&1 + # for result in "SPORTS CAR" "COFFEE MUG" "VULTURE"; do + # if [ `grep -c "$result" ${CLIENT_LOG}.${SUFFIX}` != "1" ]; then + # echo -e "\n***\n*** Failed. Expected 1 $result result\n***" + # RET=1 + # fi + # done + else + python $i -v >> "${CLIENT_LOG}.${SUFFIX}" 2>&1 + fi + + if [ $? -ne 0 ]; then + cat "${CLIENT_LOG}.${SUFFIX}" + RET=1 + fi + + if [ $(cat "${CLIENT_LOG}.${SUFFIX}" | grep "PASS" | wc -l) -ne 1 ]; then + cat "${CLIENT_LOG}.${SUFFIX}" + RET=1 + fi +done + +# Test while reusing the InferInput and InferRequestedOutput objects +$SIMPLE_REUSE_INFER_OBJECTS_CLIENT_PY -v -i grpc -u localhost:8001 >> ${CLIENT_LOG}.reuse 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.reuse + RET=1 +fi + +for i in \ + $SIMPLE_INFER_CLIENT \ + $SIMPLE_STRING_INFER_CLIENT \ + $SIMPLE_ASYNC_INFER_CLIENT \ + $SIMPLE_HEALTH_CLIENT \ + $SIMPLE_STREAM_INFER_CLIENT \ + $SIMPLE_SEQUENCE_INFER_CLIENT \ + $SIMPLE_SHM_CLIENT \ + $SIMPLE_CUDASHM_CLIENT \ + $SIMPLE_IMAGE_CLIENT \ + $SIMPLE_ENSEMBLE_IMAGE_CLIENT \ + $SIMPLE_KEEPALIVE_CLIENT \ + $SIMPLE_CUSTOM_ARGS_CLIENT \ + ; do + BASE=$(basename -- $i) + SUFFIX="${BASE%.*}" + if [[ $SUFFIX == "image_client" ]]; then + $i -m inception_graphdef -s INCEPTION -a -c 1 -b 1 -i grpc -u localhost:8001 $IMAGE >> "${CLIENT_LOG}.c++.async.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.c++.async.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. 
Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + $i -m inception_graphdef -s INCEPTION -a --streaming -c 1 -b 1 -i grpc -u localhost:8001 $IMAGE >> "${CLIENT_LOG}.c++.streaming.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.c++.streaming.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + $i -m inception_graphdef -s INCEPTION -c 1 -b 1 -i grpc -u localhost:8001 $IMAGE >> "${CLIENT_LOG}.c++.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.c++.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + # elif [ $SUFFIX == "ensemble_image_client" ]; then + # $i -c 1 -i grpc -u localhost:8001 ../images >> "${CLIENT_LOG}.c++.${SUFFIX}" 2>&1 + # for result in "SPORTS CAR" "COFFEE MUG" "VULTURE"; do + # if [ `grep -c "$result" ${CLIENT_LOG}.c++.${SUFFIX}` != "1" ]; then + # echo -e "\n***\n*** Failed. Expected 1 $result result\n***" + # RET=1 + # fi + # done + elif [[ $BASE == "simple_grpc_infer_client" ]]; then + # Test forcing new channel creation with simple infer client + NEW_CHANNEL_STRING="new connected subchannel" + CACHED_CHANNEL_STRING_NONE="There are 0 cached channels" + CACHED_CHANNEL_STRING_ONE="There are 1 cached channel" + GRPC_TRACE=subchannel GRPC_VERBOSITY=info $i -v -c "true" >> ${CLIENT_LOG}.c++.${SUFFIX} 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.${SUFFIX} + RET=1 + fi + NUM_NEW_CHANNEL_CALLS=`grep -c "${NEW_CHANNEL_STRING}" ${CLIENT_LOG}.c++.${SUFFIX}` + if [ $NUM_NEW_CHANNEL_CALLS != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 ${NEW_CHANNEL_STRING} calls but got ${NUM_NEW_CHANNEL_CALLS}\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + if [ `grep -c "${CACHED_CHANNEL_STRING_ONE}" ${CLIENT_LOG}.c++.${SUFFIX}` != "2" ]; then + echo -e "\n***\n*** Failed. Expected 1 cached channel\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + GRPC_TRACE=subchannel GRPC_VERBOSITY=info $i -v -c "false" >> ${CLIENT_LOG}.c++.${SUFFIX} 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.${SUFFIX} + RET=1 + fi + NUM_NEW_CHANNEL_CALLS=`grep -c "${NEW_CHANNEL_STRING}" ${CLIENT_LOG}.c++.${SUFFIX}` + if [ $NUM_NEW_CHANNEL_CALLS != "3" ]; then + echo -e "\n***\n*** Failed. Expected 2 ${NEW_CHANNEL_STRING} calls but got ${NUM_NEW_CHANNEL_CALLS}\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + if [ `grep -c "${CACHED_CHANNEL_STRING_NONE}" ${CLIENT_LOG}.c++.${SUFFIX}` != "2" ]; then + echo -e "\n***\n*** Failed. Expected 0 cached channels\n***" + cat $CLIENT_LOG.c++.${SUFFIX} + RET=1 + fi + else + $i -v -H test:1 >> ${CLIENT_LOG}.c++.${SUFFIX} 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.${SUFFIX} + RET=1 + fi + fi +done + +# Test while reusing the InferInput and InferRequestedOutput objects +$SIMPLE_REUSE_INFER_OBJECTS_CLIENT -v -i grpc -u localhost:8001 >> ${CLIENT_LOG}.c++.reuse 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.reuse + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${CLIENT_PLUGIN_MODELDIR} --http-header-forward-pattern=.* --grpc-header-forward-pattern=.*" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 $CLIENT_PLUGIN_TEST >> ${CLIENT_LOG}.python.plugin 2>&1 +if [ $? 
-ne 0 ]; then + cat ${CLIENT_LOG}.python.plugin + RET=1 +fi +set -e + +# Create a password file with username:password +echo -n 'username:' > pswd +echo "password" | openssl passwd -stdin -apr1 >> pswd +nginx -c `pwd`/$NGINX_CONF + +python3 $BASIC_AUTH_TEST +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.python.plugin.auth + RET=1 +fi +service nginx stop + +kill $SERVER_PID +wait $SERVER_PID + +export GRPC_TRACE=compression, channel +export GRPC_VERBOSITY=DEBUG +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --grpc-infer-response-compression-level=high" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +$SIMPLE_INFER_CLIENT -v -C deflate>> ${CLIENT_LOG}.c++.compress 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.compress + RET=1 +fi +if [ $(cat ${CLIENT_LOG}.c++.compress | grep "Compressed\[deflate\]" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}.c++.compress + RET=1 +fi + +python $SIMPLE_INFER_CLIENT_PY -v -C deflate>> ${CLIENT_LOG}.compress 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.compress + RET=1 +fi +if [ $(cat ${CLIENT_LOG}.compress | grep "Compressed\[deflate\]" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}.compress + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + +unset GRPC_TRACE +unset GRPC_VERBOSITY + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --model-control-mode=explicit" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Test Model Control API +python $SIMPLE_MODEL_CONTROL_PY -v >> ${CLIENT_LOG}.model_control 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.model_control + RET=1 +fi + +if [ $(cat ${CLIENT_LOG}.model_control | grep "PASS" | wc -l) -ne 1 ]; then + cat ${CLIENT_LOG}.model_control + RET=1 +fi +if [ $(cat ${SERVER_LOG} | grep "Invalid config override" | wc -l) -eq 0 ]; then + cat ${SERVER_LOG} + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --model-control-mode=explicit" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Test Model Control API +$SIMPLE_MODEL_CONTROL -v >> ${CLIENT_LOG}.c++.model_control 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.model_control + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test with dynamic sequence models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server_dyna.log" +CLIENT_LOG="./client_dyna.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e + +for i in \ + $SIMPLE_STREAM_INFER_CLIENT_PY \ + $SIMPLE_SEQUENCE_INFER_CLIENT_PY \ + $SIMPLE_STREAM_INFER_CLIENT \ + $SIMPLE_SEQUENCE_INFER_CLIENT; do + + $i -v -d >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Run cpp client unit test +rm -rf unit_test_models && mkdir unit_test_models +cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/. +cp -r ${MODELDIR}/simple unit_test_models/. 
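+# unit_test_models now holds onnx_int32_int32_int32 plus the simple model; the
+# server below is also started with tracing enabled (TIMESTAMPS, rate 1), which
+# lets the C++ client tests touch the trace settings APIs as well.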
+ +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=unit_test_models + --trace-file=global_unittest.log --trace-level=TIMESTAMPS --trace-rate=1" +SERVER_LOG="./inference_server_cc_unit_test.log" +CLIENT_LOG="./cc_unit_test.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Run all unit tests except load +$CC_UNIT_TEST --gtest_filter=GRPC*:-*Load* >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG} + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Run cpp client load API unit test +rm -rf unit_test_models && mkdir unit_test_models +cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/. +# Make only version 2, 3 is valid version directory while config requests 1, 3 +rm -rf unit_test_models/onnx_int32_int32_int32/1 + +# Start with EXPLICIT mode and load onnx_float32_float32_float32 +SERVER_ARGS="--model-repository=`pwd`/unit_test_models \ + --model-control-mode=explicit \ + --load-model=onnx_int32_int32_int32 \ + --strict-model-config=false" +SERVER_LOG="./inference_server_cc_unit_test.load.log" +CLIENT_LOG="./cc_unit_test.load.log" + +for i in \ + "LoadWithFileOverride" \ + "LoadWithConfigOverride" \ + ; do + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + $CC_UNIT_TEST --gtest_filter=GRPC*$i >> ${CLIENT_LOG}.$i 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.$i + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Run python grpc aio unit test +PYTHON_GRPC_AIO_TEST=python_grpc_aio_test.py +CLIENT_LOG=`pwd`/python_grpc_aio_test.log +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e +python $PYTHON_GRPC_AIO_TEST > $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Python GRPC AsyncIO Test Failed\n***" + RET=1 +fi +set -e +kill $SERVER_PID +wait $SERVER_PID + +# Test GRPC health check implemented +go install github.com/grpc-ecosystem/grpc-health-probe@latest +HEALTH_PROBE="${GOPATH}/bin/grpc-health-probe -addr=localhost:8001" + +CLIENT_LOG=`pwd`/grpc_health_probe_offline.log +set +e +$HEALTH_PROBE > $CLIENT_LOG 2>&1 +set -e +if [ `grep -c "timeout: failed to connect service" ${CLIENT_LOG}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected health check timeout\n***" + cat $CLIENT_LOG + RET=1 +fi + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +CLIENT_LOG=`pwd`/grpc_health_probe_online.log +set +e +$HEALTH_PROBE > $CLIENT_LOG 2>&1 +set -e +if [ `grep -c "status: SERVING" ${CLIENT_LOG}` != "1" ]; then + echo -e "\n***\n*** Failed. 
Expected health check to return SERVING\n***" + cat $CLIENT_LOG + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Repeated protocol, not allowed +SERVER_ARGS="--model-repository=${MODELDIR} \ + --grpc-restricted-protocol=model-repository,health:k1=v1 \ + --grpc-restricted-protocol=metadata,health:k2=v2" +run_server +EXPECTED_MSG="protocol 'health' can not be specified in multiple config groups" +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +elif [ `grep -c "${EXPECTED_MSG}" ${SERVER_LOG}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected ${EXPECTED_MSG} to be found in log\n***" + cat $SERVER_LOG + RET=1 +fi + +# Unknown protocol, not allowed +SERVER_ARGS="--model-repository=${MODELDIR} \ + --grpc-restricted-protocol=model-reposit,health:k1=v1 \ + --grpc-restricted-protocol=metadata,health:k2=v2" +run_server +EXPECTED_MSG="unknown restricted protocol 'model-reposit'" +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +elif [ `grep -c "${EXPECTED_MSG}" ${SERVER_LOG}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected ${EXPECTED_MSG} to be found in log\n***" + cat $SERVER_LOG + RET=1 +fi + +# Test restricted protocols +SERVER_ARGS="--model-repository=${MODELDIR} \ + --grpc-restricted-protocol=model-repository:admin-key=admin-value \ + --grpc-restricted-protocol=inference,health:infer-key=infer-value" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e +python $PYTHON_UNIT_TEST RestrictedProtocolTest > $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Python GRPC Restricted Protocol Test Failed\n***" + RET=1 +fi +set -e +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET + diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py new file mode 100755 index 0000000000..f7507747e9 --- /dev/null +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -0,0 +1,663 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import queue +import signal +import time +import unittest +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._response_queue = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._response_queue.put(error) + else: + user_data._response_queue.put(result) + + +# These state cleanup tests relies on the test.sh +# to check whether all the created request objects +# were properly deleted by the sever. +# The purpose on these unittest is to exercise +# different portions of the gRPC frontend and +# and track the state objects. +class CleanUpTest(tu.TestResultCollector): + SERVER_PID = None + + def setUp(self): + self.decoupled_model_name_ = "repeat_int32" + self.identity_model_name_ = "custom_zero_1_float32" + self.repeat_non_decoupled_model_name = "repeat_int32_non_decoupled" + + def _prepare_inputs_and_outputs(self, kind): + if kind in ("decoupled_streaming", "non_decoupled_streaming"): + self.inputs_ = [] + self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) + self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) + self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) + + self.outputs_ = [] + self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) + self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) + self.requested_outputs_ = self.outputs_ + elif kind in ("simple", "streaming"): + self.inputs_ = [] + self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "FP32")) + + self.outputs_ = [] + self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0")) + self.requested_outputs_ = self.outputs_ + else: + raise ValueError("Unsupported kind specified to prepare inputs/outputs") + + def _simple_infer( + self, + request_count, + cancel_response_idx=None, + client_timeout_pair=None, + kill_server=None, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + self._prepare_inputs_and_outputs("simple") + + input_data = np.array([[1.0]], dtype=np.float32) + self.inputs_[0].set_data_from_numpy(input_data) + + user_data = UserData() + + futures = [] + timeout_idx = None + timeout_value = None + if client_timeout_pair: + timeout_idx, timeout_value = client_timeout_pair + for i in range(request_count): + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + this_timeout = None + if timeout_idx == i: + this_timeout = timeout_value + futures.append( + triton_client.async_infer( + model_name=self.identity_model_name_, + inputs=self.inputs_, + request_id=str(i), + callback=partial(callback, user_data), + outputs=self.requested_outputs_, + client_timeout=this_timeout, + ) + ) + + if cancel_response_idx is not None: + futures[cancel_response_idx].cancel() + + responses = [] + while len(responses) < 
len(futures): + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + responses.append(data_item) + + for response in responses: + output0_data = response.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(input_data, output0_data)) + + def _stream_infer_with_params( + self, + request_count, + request_delay, + _, + user_data, + result_dict, + delay_data=None, + delay_factor=None, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + triton_client.async_stream_infer( + model_name=self.decoupled_model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + # Opt-in to receiving flags-only responses from model/backend + # to help detect final responses for decoupled models. + enable_empty_final_response=True, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + completed_requests = 0 + while completed_requests < request_count: + if cancel_response_idx == recv_count: + triton_client.stop_stream(cancel_requests=True) + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response = data_item.get_response() + # Request IDs should generally be provided with each request + # to associate decoupled responses with their requests. + if not response.id: + raise ValueError( + "No response id found. Was a request_id provided?" + ) + + # Detect final response. 
Parameters are oneof and we expect bool_param + if response.parameters.get("triton_final_response").bool_param: + completed_requests += 1 + + # Only process non-empty response, ignore if empty (no outputs) + if response.outputs: + if response.id not in result_dict: + result_dict[response.id] = [] + result_dict[response.id].append((recv_count, data_item)) + recv_count += 1 + + def _stream_infer( + self, + request_count, + request_delay, + expected_count, + user_data, + result_dict, + delay_data=None, + delay_factor=None, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + model_name = self.identity_model_name_ + if delay_data is not None: + model_name = self.decoupled_model_name_ + self.inputs_[1].set_data_from_numpy(delay_data) + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + triton_client.async_stream_infer( + model_name=model_name, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + ) + if (delay_data is not None) and (delay_factor is not None): + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + while recv_count < expected_count: + if cancel_response_idx == recv_count: + triton_client.stop_stream(cancel_requests=True) + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict: + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + def _streaming_infer( + self, + request_count, + request_delay=0, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + should_error=True, + ): + self._prepare_inputs_and_outputs("streaming") + + input_data = np.array([[1.0]], dtype=np.float32) + self.inputs_[0].set_data_from_numpy(input_data) + + user_data = UserData() + result_dict = {} + + try: + expected_count = request_count + self._stream_infer( + request_count, + request_delay, + expected_count, + user_data, + result_dict, + cancel_response_idx=cancel_response_idx, + stream_timeout=stream_timeout, + kill_server=kill_server, + ) + except Exception as ex: + if cancel_response_idx or stream_timeout or should_error: + raise ex + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. 
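+        # Expect exactly one response per request id, with OUTPUT0 echoing the
+        # input tensor (custom_zero_1_float32 behaves as an identity model here).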
+ for i in range(request_count): + this_id = str(i) + if this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + self.assertEqual(len(result_dict[this_id]), 1) + result = result_dict[this_id][0][1] + output0_data = result.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(input_data, output0_data)) + + def _decoupled_infer( + self, + request_count, + request_delay=0, + repeat_count=1, + data_offset=100, + delay_time=1000, + delay_factor=1, + wait_time=500, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + should_error=True, + infer_helper_map=[True, True], + ): + self._prepare_inputs_and_outputs(kind="decoupled_streaming") + + # Initialize data for IN + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + self.inputs_[1].set_shape([repeat_count]) + + # Initialize data for WAIT + wait_data = np.array([wait_time], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + infer_helpers = [] + if infer_helper_map[0]: + infer_helpers.append(self._stream_infer) + if infer_helper_map[1]: + infer_helpers.append(self._stream_infer_with_params) + + for infer_helper in infer_helpers: + user_data = UserData() + result_dict = {} + + try: + expected_count = repeat_count * request_count + infer_helper( + request_count, + request_delay, + expected_count, + user_data, + result_dict, + delay_data, + delay_factor, + cancel_response_idx, + stream_timeout, + kill_server, + ) + except Exception as ex: + if cancel_response_idx or stream_timeout or should_error: + raise ex + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. + for i in range(request_count): + this_id = str(i) + if repeat_count != 0 and this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + elif repeat_count == 0 and this_id in result_dict.keys(): + self.assertTrue( + False, + "received unexpected response for request id {}".format( + this_id + ), + ) + if repeat_count != 0: + self.assertEqual(len(result_dict[this_id]), repeat_count) + expected_data = data_offset + result_list = result_dict[this_id] + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + this_idx = result_list[j][1].as_numpy("IDX") + self.assertEqual(len(this_idx), 1) + self.assertEqual(this_idx[0], j) + expected_data += 1 + + ### + ### Non-Streaming Tests + ### + def test_simple_infer(self): + # This test case sends 10 asynchronous requests and validates + # the response. + self._simple_infer(request_count=10) + + def test_simple_infer_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when one of the request is cancelled from + # the client side. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10, cancel_response_idx=5) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_simple_infer_timeout(self): + # This test case is used to check whether all the states are + # correctly released when the request gets timed-out on the client. 
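+        # A 0.1 second client timeout is applied to request index 5 only; the
+        # expired gRPC deadline should surface to the client as "Deadline Exceeded".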
+ with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10, client_timeout_pair=[5, 0.1]) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_simple_infer_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10) + self.assertIn( + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + str(cm.exception), + ) + + def test_simple_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in the beginning + # of inference run with final parameters being returned. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=20, kill_server=5) + + ### + ### Streaming Tests + ### + def test_streaming_infer(self): + # Sanity test to check whether all the state objects + # are correctly released. Sends 10 requests in a single + # gRPC bidirectional stream. + self._streaming_infer(request_count=10) + + def test_streaming_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when the stream is closed when fifth + # response is received. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, cancel_response_idx=5) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_streaming_timeout(self): + # This test case is used to check whether all the states are + # released when some of the requests timeouts. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, request_delay=1, stream_timeout=2) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_streaming_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + expected_exceptions = [ + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", + ] + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, should_error=True) + + exception_match = False + for expected_exception in expected_exceptions: + exception_match |= expected_exception in str(cm.exception) + self.assertTrue( + exception_match, "Raised unexpected exception {}".format(str(cm.exception)) + ) + + def test_streaming_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer( + request_count=10, + request_delay=1, + kill_server=5, + should_error=True, + ) + + ### + ### Decoupled Streaming Tests + ### + def test_decoupled_infer(self): + # Sanity test to check whether all the state objects + # are correctly released. Sends 10 requests in a single + # gRPC bidirectional stream and expects each of these + # requests to generate 10 responses. 
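+        # repeat_int32 is decoupled, so repeat_count=10 responses are expected
+        # per request, i.e. 100 responses in total over the stream.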
+ self._decoupled_infer(request_count=10, repeat_count=10) + + def test_decoupled_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when the stream is closed when fifth + # response is received. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, repeat_count=10, cancel_response_idx=5 + ) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_decoupled_timeout(self): + # This test case is used to check whether all the states are + # released when some of the requests timeouts. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, repeat_count=10, request_delay=1, stream_timeout=2 + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_decoupled_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + expected_exceptions = [ + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", + ] + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer(request_count=10, repeat_count=10, should_error=True) + + exception_match = False + for expected_exception in expected_exceptions: + exception_match |= expected_exception in str(cm.exception) + self.assertTrue( + exception_match, "Raised unexpected exception {}".format(str(cm.exception)) + ) + + def test_decoupled_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, + repeat_count=10, + request_delay=1, + kill_server=5, + should_error=True, + infer_helper_map=[True, False], + ) + + def test_decoupled_infer_with_params_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run with final parameters being returned. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, + repeat_count=10, + request_delay=1, + kill_server=5, + should_error=True, + infer_helper_map=[False, True], + ) + + def test_decoupled_infer_complete(self): + # Test if the Process() thread could release the state object before + # the StreamInferResponseComplete() thread is done accessing it. + self._decoupled_infer(request_count=1, repeat_count=1, stream_timeout=16) + # Check no error is printed to the log. + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Should not print this", server_log) + + def test_non_decoupled_streaming_multi_response(self): + # Test non-decoupled streaming infer with more than one response should return + # the first response. 
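+        # The repeat model is asked for 4 responses, but this copy is registered
+        # as non-decoupled, so only the first response (IDX 0) should reach the
+        # streaming client.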
+ response_count = 4 + expected_response_count = 1 + expected_response_index = 0 + + # Prepare input data + self._prepare_inputs_and_outputs("non_decoupled_streaming") + # Initialize data for IN + data_offset = 100 + input_data = np.arange( + start=data_offset, stop=data_offset + response_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([response_count]) + self.inputs_[0].set_data_from_numpy(input_data) + # Initialize data for DELAY + delay_data = np.zeros([response_count], dtype=np.uint32) + self.inputs_[1].set_shape([response_count]) + self.inputs_[1].set_data_from_numpy(delay_data) + # Initialize data for WAIT + wait_data = np.array([0], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + # Infer + user_data = UserData() + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as client: + # Establish stream + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) + # Send a request + client.async_stream_infer( + model_name=self.repeat_non_decoupled_model_name, + inputs=self.inputs_, + request_id="0", + outputs=self.requested_outputs_, + ) + # Wait for all results and stop stream + client.stop_stream() + + # Check infer output + actual_response_count = 0 + while not user_data._response_queue.empty(): + actual_response_count += 1 + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response_idx = data_item.as_numpy("IDX")[0] + self.assertEqual(response_idx, expected_response_index) + self.assertEqual(actual_response_count, expected_response_count) + + +if __name__ == "__main__": + CleanUpTest.SERVER_PID = os.environ.get("SERVER_PID", CleanUpTest.SERVER_PID) + unittest.main() diff --git a/qa/L0_grpc_state_cleanup/test.sh b/qa/L0_grpc_state_cleanup/test.sh new file mode 100755 index 0000000000..df302d5ed1 --- /dev/null +++ b/qa/L0_grpc_state_cleanup/test.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 +CLEANUP_TEST=cleanup_test.py + +rm -f *.log + +CLIENT_LOG=`pwd`/client.log +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +function check_state_release() { + local log_file=$1 + + num_state_release=`cat $log_file | grep "StateRelease" | wc -l` + num_state_new=`cat $log_file | grep "StateNew" | wc -l` + + if [ $num_state_release -ne $num_state_new ]; then + cat $log_file + echo -e "\n***\n*** Test Failed: Mismatch detected, $num_state_new state(s) created, $num_state_release state(s) released. \n***" >> $log_file + return 1 + fi + + return 0 +} + +rm -fr ./models/custom_zero_1_float32 && \ + cp -r ../custom_models/custom_zero_1_float32 ./models/. && \ + mkdir -p ./models/custom_zero_1_float32/1 + +(cd models/custom_zero_1_float32 && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +rm -rf models/repeat_int32_non_decoupled && \ + cp -r models/repeat_int32 models/repeat_int32_non_decoupled && \ + (cd models/repeat_int32_non_decoupled && \ + sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ + sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) + +for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + set -e +done + + +for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done + +for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done + +TEST_NAME=test_decoupled_infer_complete +export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + +SERVER_LOG="./inference_server.$TEST_NAME.log" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_NAME" >>$CLIENT_LOG + +set +e + +SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +check_state_release $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_http/generate_endpoint_test.py b/qa/L0_http/generate_endpoint_test.py new file mode 100755 index 0000000000..3eb0b6ea5f --- /dev/null +++ b/qa/L0_http/generate_endpoint_test.py @@ -0,0 +1,506 @@ +#!/usr/bin/python3 +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json +import threading +import time +import unittest + +import requests +import sseclient +import test_util as tu + + +class GenerateEndpointTest(tu.TestResultCollector): + def setUp(self): + self._model_name = "mock_llm" + + def _get_infer_url(self, model_name, route): + return f"http://localhost:8000/v2/models/{model_name}/{route}" + + def generate_stream(self, model_name, inputs, stream=False): + headers = {"Accept": "text/event-stream"} + url = self._get_infer_url(model_name, "generate_stream") + # stream=True used to indicate response can be iterated over, which + # should be the common setting for generate_stream. + # For correctness test case, stream=False so that we can re-examine + # the response content. 
+ return requests.post( + url, + data=inputs if isinstance(inputs, str) else json.dumps(inputs), + headers=headers, + stream=stream, + ) + + def generate(self, model_name, inputs): + url = self._get_infer_url(model_name, "generate") + return requests.post( + url, data=inputs if isinstance(inputs, str) else json.dumps(inputs) + ) + + def generate_expect_failure(self, model_name, inputs, msg): + url = self._get_infer_url(model_name, "generate") + r = requests.post( + url, data=inputs if isinstance(inputs, str) else json.dumps(inputs) + ) + # Content-Type header should always be JSON for errors + self.assertEqual(r.headers["Content-Type"], "application/json") + + try: + r.raise_for_status() + self.assertTrue(False, f"Expected failure, success for {inputs}") + except requests.exceptions.HTTPError as e: + self.assertIn(msg, r.json()["error"]) + + def generate_stream_expect_failure(self, model_name, inputs, msg): + r = self.generate_stream(model_name, inputs) + # Content-Type header should always be JSON for errors + self.assertEqual(r.headers["Content-Type"], "application/json") + + try: + r.raise_for_status() + self.assertTrue(False, f"Expected failure, success for {inputs}") + except requests.exceptions.HTTPError as e: + self.assertIn(msg, r.json()["error"]) + + def generate_stream_expect_success( + self, model_name, inputs, expected_output, rep_count + ): + r = self.generate_stream(model_name, inputs) + r.raise_for_status() + self.check_sse_responses(r, [{"TEXT": expected_output}] * rep_count) + + def check_sse_responses(self, res, expected_res): + # Validate SSE format + self.assertIn("Content-Type", res.headers) + self.assertEqual( + "text/event-stream; charset=utf-8", res.headers["Content-Type"] + ) + + # SSE format (data: []) is hard to parse, use helper library for simplicity + client = sseclient.SSEClient(res) + res_count = 0 + for event in client.events(): + # Parse event data, join events into a single response + data = json.loads(event.data) + for key, value in expected_res[res_count].items(): + self.assertIn(key, data) + self.assertEqual(value, data[key]) + res_count += 1 + self.assertEqual(len(expected_res), res_count) + # Make sure there is no message in the wrong form + for remaining in client._read(): + self.assertTrue( + remaining.startswith(b"data:"), + f"SSE response not formed properly, got: {remaining}", + ) + self.assertTrue( + remaining.endswith(b"\n\n"), + f"SSE response not formed properly, got: {remaining}", + ) + + def test_generate(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": text, "STREAM": False} + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + + def test_generate_with_all_inputs(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": text, "STREAM": False, "input_ids": [100, 200]} + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + + def test_request_id(self): + # Setup text based input + text = "hello world" + request_id = "42" + + # Test when request id in request body + inputs = {"PROMPT": text, "id": request_id, "STREAM": False} + r = self.generate(self._model_name, 
inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("id", data) + self.assertEqual(request_id, data["id"]) + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + + # Test when request id not in request body + inputs = {"PROMPT": text, "STREAM": False} + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertNotIn("id", data) + + # Test when request id is empty + inputs = {"PROMPT": text, "id": "", "STREAM": False} + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertNotIn("id", data) + self.assertIn("TEXT", data) + self.assertEqual(text, data["TEXT"]) + + def test_generate_stream(self): + # Setup text-based input + text = "hello world" + rep_count = 3 + inputs = {"PROMPT": [text], "STREAM": True, "REPETITION": rep_count} + self.generate_stream_expect_success(self._model_name, inputs, text, rep_count) + + def test_streaming(self): + # verify the responses are streamed as soon as it is generated + text = "hello world" + rep_count = 3 + inputs = {"PROMPT": [text], "STREAM": True, "REPETITION": rep_count, "DELAY": 2} + past = time.time() + res = self.generate_stream(self._model_name, inputs, stream=True) + client = sseclient.SSEClient(res) + # This test does not focus on event content + for _ in client.events(): + now = time.time() + self.assertTrue(1 < (now - past) < 3) + past = now + + def test_missing_inputs(self): + missing_all_inputs = [ + # Missing all inputs + {}, + {"abc": 123}, + ] + missing_one_input = [ + # Missing 1 input + {"PROMPT": "hello"}, + {"STREAM": False}, + {"STREAM": False, "other": "param"}, + ] + for inputs in missing_all_inputs: + self.generate_expect_failure( + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", + ) + self.generate_stream_expect_failure( + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 0", + ) + + for inputs in missing_one_input: + self.generate_expect_failure( + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", + ) + self.generate_stream_expect_failure( + self._model_name, + inputs, + "expected number of inputs between 2 and 3 but got 1", + ) + + def test_invalid_input_types(self): + invalid_bool = "attempt to access JSON non-boolean as boolean" + invalid_string = "attempt to access JSON non-string as string" + invalid_type_inputs = [ + # Prompt bad type + ({"PROMPT": 123, "STREAM": False}, invalid_string), + # Stream bad type + ({"PROMPT": "hello", "STREAM": "false"}, invalid_bool), + # Both bad type, parsed in order + ({"PROMPT": True, "STREAM": 123}, invalid_string), + ({"STREAM": 123, "PROMPT": True}, invalid_bool), + ] + + for inputs, error_msg in invalid_type_inputs: + self.generate_expect_failure(self._model_name, inputs, error_msg) + self.generate_stream_expect_failure(self._model_name, inputs, error_msg) + + def test_duplicate_inputs(self): + dupe_prompt = "input 'PROMPT' already exists in request" + dupe_stream = "input 'STREAM' already exists in request" + # Use JSON string directly as Python Dict doesn't support duplicate keys + invalid_type_inputs = [ + # One duplicate + ( + '{"PROMPT": 
"hello", "STREAM": false, "PROMPT": "duplicate"}', + dupe_prompt, + ), + ('{"PROMPT": "hello", "STREAM": false, "STREAM": false}', dupe_stream), + # Multiple duplicates, parsed in order + ( + '{"PROMPT": "hello", "STREAM": false, "PROMPT": "duplicate", "STREAM": true}', + dupe_prompt, + ), + ( + '{"PROMPT": "hello", "STREAM": false, "STREAM": true, "PROMPT": "duplicate"}', + dupe_stream, + ), + ] + for inputs, error_msg in invalid_type_inputs: + self.generate_expect_failure(self._model_name, inputs, error_msg) + self.generate_stream_expect_failure(self._model_name, inputs, error_msg) + + def test_generate_stream_response_error(self): + # Setup text-based input + text = "hello world" + inputs = {"PROMPT": [text], "STREAM": True, "REPETITION": 0, "FAIL_LAST": True} + r = self.generate_stream(self._model_name, inputs) + + # With "REPETITION": 0, error will be first response and the HTTP code + # will be set properly + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.check_sse_responses(r, [{"error": "An Error Occurred"}]) + + # With "REPETITION" > 0, the first response is valid response and set + # HTTP code to success, so user must validate each response + inputs["REPETITION"] = 1 + r = self.generate_stream(self._model_name, inputs) + r.raise_for_status() + + self.check_sse_responses(r, [{"TEXT": text}, {"error": "An Error Occurred"}]) + + def test_race_condition(self): + # In Triton HTTP frontend, the HTTP response is sent in a different + # thread than Triton response complete thread, both programs have shared + # access to the same object, so this test is sending sufficient load to + # the endpoint, in attempt to expose race condition if any . + input1 = {"PROMPT": "hello", "STREAM": False, "param": "segfault"} + input2 = { + "PROMPT": "hello", + "STREAM": True, + "REPETITION": 3, + "param": "segfault", + } + threads = [] + + def thread_func(model_name, inputs): + self.generate_stream(model_name, inputs).raise_for_status() + + for _ in range(50): + threads.append( + threading.Thread(target=thread_func, args=((self._model_name, input1))) + ) + threads.append( + threading.Thread(target=thread_func, args=((self._model_name, input2))) + ) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + def test_one_response(self): + # In the current 'inputs' setting, the model will send at least 1 + # response, "STREAM" controls model behavior on sending model responses: + # If True, the model sends two responses, one is the actual infer + # response and the other contains flag only to signal end of response. + # 'generate_stream' endpoint is designed for this case so it should send + # infer response and complete HTTP response appropriately. And + # 'generate' endpoint will be able to handle this case as at its core + # only one infer response is received, which is the same as typical HTTP + # usage. + # If False, the model sends one response containing infer response and + # end flag, which is the same as how non-decoupled model responds. 
+ inputs = {"PROMPT": "hello world", "STREAM": True} + r = self.generate_stream(self._model_name, inputs) + r.raise_for_status() + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + inputs["STREAM"] = False + r = self.generate_stream(self._model_name, inputs) + r.raise_for_status() + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + def test_zero_response(self): + inputs = {"PROMPT": "hello world", "STREAM": True, "REPETITION": 0} + r = self.generate_stream(self._model_name, inputs) + r.raise_for_status() + # Expect generate fails the inference + r = self.generate(self._model_name, inputs) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.assertIn( + "generate expects model to produce exactly 1 response", + r.json()["error"], + ) + + def test_many_response(self): + inputs = {"PROMPT": "hello world", "STREAM": True, "REPETITION": 2} + r = self.generate_stream(self._model_name, inputs) + r.raise_for_status() + # Expect generate fails the inference + r = self.generate(self._model_name, inputs) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.assertIn( + "generate expects model to produce exactly 1 response", + r.json()["error"], + ) + + def test_complex_schema(self): + # Currently only the fundamental conversion is supported, nested object + # in the request will results in parsing error + + # complex object to parameters (specifying non model input) + inputs = { + "PROMPT": "hello world", + "STREAM": True, + "PARAMS": {"PARAM_0": 0, "PARAM_1": True, "PARAM_2": 123.123}, + } + r = self.generate(self._model_name, inputs) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.assertIn("parameter 'PARAMS' has invalid type", r.json()["error"]) + + # complex object to model input + inputs = { + "PROMPT": {"USER": "hello world", "BOT": "world hello"}, + "STREAM": True, + } + r = self.generate(self._model_name, inputs) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.assertIn( + "attempt to access JSON non-string as string", r.json()["error"] + ) + + def test_close_connection_during_streaming(self): + # verify the responses are streamed as soon as it is generated + text = "hello world" + rep_count = 3 + inputs = {"PROMPT": [text], "STREAM": True, "REPETITION": rep_count, "DELAY": 2} + res = self.generate_stream(self._model_name, inputs, stream=True) + # close connection while the responses are being generated + res.close() + # check server healthiness + health_url = "http://localhost:8000/v2/health/live" + requests.get(health_url).raise_for_status() + + def test_parameters(self): + # Test reserved nested object for parameters + text = "hello world" + rep_count = 3 + inputs = { + "PROMPT": [text], + "STREAM": True, + "parameters": {"REPETITION": rep_count}, + } + self.generate_stream_expect_success(self._model_name, inputs, text, rep_count) + + # parameters keyword is not an object + inputs = {"PROMPT": [text], "STREAM": True, "parameters": 1} + + r = self.generate(self._model_name, inputs) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.assertIn( + "Expected JSON object for keyword: 'parameters'", r.json()["error"] + ) + + # parameters contains complex object + inputs = { + "PROMPT": [text], + "STREAM": True, + "parameters": {"nested": {"twice": 1}}, + } + + r = self.generate(self._model_name, inputs) + try: + r.raise_for_status() + except requests.exceptions.HTTPError as e: + self.assertIn( + 
"Converting keyword: 'parameters': parameter 'nested' has invalid type.", + r.json()["error"], + ) + + def test_0_dimension_output(self): + # With the trtllm backend, if the end token is predicted at the first + # step, the output tensors will have the shapes with 0 dimension. + text = "hello world" + inputs = { + "PROMPT": text, + "STREAM": False, + "REPETITION": 0, + "OUTPUT_0_DIM": True, + } + + r = self.generate(self._model_name, inputs) + r.raise_for_status() + + self.assertIn("Content-Type", r.headers) + self.assertEqual(r.headers["Content-Type"], "application/json") + + data = r.json() + self.assertIn("TEXT", data) + self.assertEqual([], data["TEXT"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http/generate_models/mock_llm/1/model.py b/qa/L0_http/generate_models/mock_llm/1/model.py new file mode 100644 index 0000000000..117b097c3d --- /dev/null +++ b/qa/L0_http/generate_models/mock_llm/1/model.py @@ -0,0 +1,109 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import time + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + self.decoupled = self.model_config.get("model_transaction_policy", {}).get( + "decoupled" + ) + + def execute(self, requests): + if self.decoupled: + return self.exec_decoupled(requests) + else: + return self.exec(requests) + + def exec(self, requests): + responses = [] + for request in requests: + params = json.loads(request.parameters()) + rep_count = params["REPETITION"] if "REPETITION" in params else 1 + + input_np = pb_utils.get_input_tensor_by_name(request, "PROMPT").as_numpy() + stream_np = pb_utils.get_input_tensor_by_name(request, "STREAM").as_numpy() + stream = stream_np.flatten()[0] + if stream: + responses.append( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + "STREAM only supported in decoupled mode" + ) + ) + ) + else: + out_tensor = pb_utils.Tensor( + "TEXT", np.repeat(input_np, rep_count, axis=1) + ) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses + + def exec_decoupled(self, requests): + for request in requests: + params = json.loads(request.parameters()) + rep_count = params["REPETITION"] if "REPETITION" in params else 1 + fail_last = params["FAIL_LAST"] if "FAIL_LAST" in params else False + delay = params["DELAY"] if "DELAY" in params else None + output_0_dim = params["OUTPUT_0_DIM"] if "OUTPUT_0_DIM" in params else False + + sender = request.get_response_sender() + input_np = pb_utils.get_input_tensor_by_name(request, "PROMPT").as_numpy() + stream_np = pb_utils.get_input_tensor_by_name(request, "STREAM").as_numpy() + out_value = np.array([]) if output_0_dim else input_np + out_tensor = pb_utils.Tensor("TEXT", out_value) + response = pb_utils.InferenceResponse([out_tensor]) + # If stream enabled, just send multiple copies of response + # FIXME: Could split up response string into tokens, but this is simpler for now. + stream = stream_np.flatten()[0] + if stream: + for _ in range(rep_count): + if delay is not None: + time.sleep(delay) + if not sender.is_cancelled(): + sender.send(response) + else: + break + sender.send( + None + if not fail_last + else pb_utils.InferenceResponse( + error=pb_utils.TritonError("An Error Occurred") + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + # If stream disabled, just send one response + else: + sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + return None diff --git a/qa/L0_http/generate_models/mock_llm/config.pbtxt b/qa/L0_http/generate_models/mock_llm/config.pbtxt new file mode 100644 index 0000000000..74a306052a --- /dev/null +++ b/qa/L0_http/generate_models/mock_llm/config.pbtxt @@ -0,0 +1,66 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +backend: "python" + +max_batch_size: 0 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "PROMPT" + data_type: TYPE_STRING + dims: [ 1, 1 ] + }, + { + name: "STREAM" + data_type: TYPE_BOOL + dims: [ 1, 1 ] + }, + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ 1, -1 ] + optional: true + } +] + +output [ + { + name: "TEXT" + data_type: TYPE_STRING + dims: [ 1, -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/qa/L0_http/http_basic_auth_test.py b/qa/L0_http/http_basic_auth_test.py new file mode 100755 index 0000000000..5aa1f71d81 --- /dev/null +++ b/qa/L0_http/http_basic_auth_test.py @@ -0,0 +1,66 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
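+# Note: these tests assume the nginx reverse proxy configured by nginx.conf in
+# this change is running on localhost:8004 with HTTP basic auth, forwarding to
+# Triton's HTTP endpoint on localhost:8000; test.sh creates the username and
+# password used below before invoking this file.
+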
+import sys +import unittest + +sys.path.append("../common") + +import test_util as tu +import tritonclient.http as tritonhttpclient +import tritonclient.http.aio as asynctritonhttpclient +from tritonclient.http.aio.auth import BasicAuth as AsyncBasicAuth +from tritonclient.http.auth import BasicAuth + + +class HTTPBasicAuthTest(tu.TestResultCollector): + def setUp(self): + # Use the nginx port + self._client = tritonhttpclient.InferenceServerClient(url="localhost:8004") + self._client.register_plugin(BasicAuth("username", "password")) + + def test_client_call(self): + self.assertTrue(self._client.is_server_live()) + + def tearDown(self): + self._client.close() + + +class HTTPBasicAuthAsyncTest(unittest.IsolatedAsyncioTestCase): + async def asyncSetUp(self): + # Use the nginx port + self._client = asynctritonhttpclient.InferenceServerClient(url="localhost:8004") + self._client.register_plugin(AsyncBasicAuth("username", "password")) + + async def test_client_call(self): + self.assertTrue(await self._client.is_server_live()) + + async def asyncTearDown(self): + await self._client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http/http_client_plugin_test.py b/qa/L0_http/http_client_plugin_test.py new file mode 100755 index 0000000000..963ea2a81b --- /dev/null +++ b/qa/L0_http/http_client_plugin_test.py @@ -0,0 +1,175 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest +from unittest.mock import AsyncMock, MagicMock, patch + +import numpy as np +import test_util as tu +import tritonclient.http as tritonhttpclient +import tritonclient.http.aio as asynctritonhttpclient +from tritonclient.http import InferenceServerClientPlugin +from tritonclient.utils import np_to_triton_dtype + + +# A simple plugin that adds headers to the inference request. 
+class TestPlugin(InferenceServerClientPlugin): + def __init__(self, headers): + self._headers = headers + + def __call__(self, request): + request.headers.update(self._headers) + + +class HTTPClientPluginAsyncTest(unittest.IsolatedAsyncioTestCase): + async def asyncSetUp(self): + self._headers = {"MY-KEY": "MY-VALUE"} + self._plugin = TestPlugin(self._headers) + self._client = asynctritonhttpclient.InferenceServerClient(url="localhost:8001") + + async def test_server_is_live(self): + # We are testing is_server_live as an example API that uses GET method + # for communication with the server. + self._client._stub.get = AsyncMock() + + self._client.register_plugin(self._plugin) + self.assertEqual(self._plugin, self._client.plugin()) + await self._client.is_server_live() + self._client._stub.get.assert_awaited_with( + url=unittest.mock.ANY, headers=self._headers + ) + + # Make sure unregistering the plugin would no longer add the headers + self._client.unregister_plugin() + self.assertEqual(None, self._client.plugin()) + await self._client.is_server_live() + self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY, headers={}) + + async def test_simple_infer(self): + # Only the read function must return async + post_return = MagicMock() + post_return.read = AsyncMock() + self._client._stub.post = AsyncMock(return_value=post_return) + + np_input = np.arange(8, dtype=np.float32).reshape(1, -1) + model = "onnx_zero_1_float32" + + # Setup inputs + inputs = [] + inputs.append( + tritonhttpclient.InferInput( + "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype) + ) + ) + + # Set the binary data to False so that 'Inference-Header-Length' is not + # added to the headers. + inputs[0].set_data_from_numpy(np_input, binary_data=False) + + async def run_infer(headers): + with patch("tritonclient.http.aio._raise_if_error"): + with patch("tritonclient.http.aio.InferResult"): + await self._client.infer(model_name=model, inputs=inputs) + self._client._stub.post.assert_awaited_with( + url=unittest.mock.ANY, data=unittest.mock.ANY, headers=headers + ) + + self._client.register_plugin(self._plugin) + await run_infer(self._headers) + + self._client.unregister_plugin() + await run_infer({}) + + async def asyncTearDown(self): + await self._client.close() + + +class HTTPClientPluginTest(tu.TestResultCollector): + def setUp(self): + self._headers = {"MY-KEY": "MY-VALUE"} + self._plugin = TestPlugin(self._headers) + self._client = tritonhttpclient.InferenceServerClient(url="localhost:8001") + + # Use magic mock for the client stub + self._client._client_stub = MagicMock() + + def test_server_is_live(self): + # We are testing is_server_live as an example API that uses GET method + # for communication with the server. + self._client.register_plugin(self._plugin) + self._client.is_server_live() + self._client._client_stub.get.assert_called_with( + unittest.mock.ANY, headers=self._headers + ) + + # Make sure unregistering the plugin would no longer add the headers + self._client.unregister_plugin() + self._client.is_server_live() + self._client._client_stub.get.assert_called_with(unittest.mock.ANY, headers={}) + + def test_simple_infer(self): + np_input = np.arange(8, dtype=np.float32).reshape(1, -1) + model = "onnx_zero_1_float32" + + # Setup inputs + inputs = [] + inputs.append( + tritonhttpclient.InferInput( + "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype) + ) + ) + + # Set the binary data to False so that 'Inference-Header-Length' is not + # added to the headers. 
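+        # (With binary_data=False the tensor contents are JSON-encoded in the
+        # request body, so no binary section is appended and the client does
+        # not add the 'Inference-Header-Content-Length' header; the headers
+        # asserted below then contain only what the plugin injects.)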
+ inputs[0].set_data_from_numpy(np_input, binary_data=False) + + def run_infer(headers): + with patch("tritonclient.http._client._raise_if_error"): + with patch("tritonclient.http._client.InferResult"): + self._client.infer(model_name=model, inputs=inputs) + self._client._client_stub.post.assert_called_with( + request_uri=unittest.mock.ANY, + body=unittest.mock.ANY, + headers=headers, + ) + + self._client.register_plugin(self._plugin) + run_infer(self._headers) + + self._client.unregister_plugin() + run_infer({}) + + def tearDown(self): + self._client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http/http_restricted_api_test.py b/qa/L0_http/http_restricted_api_test.py new file mode 100755 index 0000000000..e5e3d5fd2d --- /dev/null +++ b/qa/L0_http/http_restricted_api_test.py @@ -0,0 +1,94 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import tritonclient.http as tritonhttpclient +from tritonclient.utils import InferenceServerException + + +class RestrictedAPITest(unittest.TestCase): + def setUp(self): + self.model_name_ = "simple" + self.client_ = tritonhttpclient.InferenceServerClient("localhost:8000") + + # Other unspecified APIs should not be restricted + def test_sanity(self): + self.client_.get_inference_statistics("simple") + self.client_.get_inference_statistics( + "simple", headers={"infer-key": "infer-value"} + ) + + # metadata, infer, model repository APIs are restricted. + # metadata and infer expects "infer-key : infer-value" header, + # model repository expected "admin-key : admin-value". 
+ def test_model_repository(self): + with self.assertRaisesRegex(InferenceServerException, "This API is restricted"): + self.client_.unload_model( + self.model_name_, headers={"infer-key": "infer-value"} + ) + # Request goes through and gets the actual transaction error + with self.assertRaisesRegex( + InferenceServerException, "explicit model load / unload is not allowed" + ): + self.client_.unload_model( + self.model_name_, headers={"admin-key": "admin-value"} + ) + + def test_metadata(self): + with self.assertRaisesRegex(InferenceServerException, "This API is restricted"): + self.client_.get_server_metadata() + self.client_.get_server_metadata({"infer-key": "infer-value"}) + + def test_infer(self): + # setup + inputs = [ + tritonhttpclient.InferInput("INPUT0", [1, 16], "INT32"), + tritonhttpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32)) + inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32)) + + # This test only cares whether the request goes through + with self.assertRaisesRegex(InferenceServerException, "This API is restricted"): + _ = self.client_.infer( + model_name=self.model_name_, inputs=inputs, headers={"test": "1"} + ) + self.client_.infer( + model_name=self.model_name_, + inputs=inputs, + headers={"infer-key": "infer-value"}, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http/http_test.py b/qa/L0_http/http_test.py new file mode 100755 index 0000000000..4432fe9186 --- /dev/null +++ b/qa/L0_http/http_test.py @@ -0,0 +1,236 @@ +#!/usr/bin/python +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import sys + +sys.path.append("../common") + +import threading +import time +import unittest + +import numpy as np +import requests +import test_util as tu +import tritonclient.http as tritonhttpclient +from tritonclient.utils import InferenceServerException, np_to_triton_dtype + + +class HttpTest(tu.TestResultCollector): + def _get_infer_url(self, model_name): + return "http://localhost:8000/v2/models/{}/infer".format(model_name) + + def _raw_binary_helper( + self, model, input_bytes, expected_output_bytes, extra_headers={} + ): + # Select model that satisfies constraints for raw binary request + headers = {"Inference-Header-Content-Length": "0"} + # Add extra headers (if any) before sending request + headers.update(extra_headers) + r = requests.post(self._get_infer_url(model), data=input_bytes, headers=headers) + r.raise_for_status() + + # Get the inference header size so we can locate the output binary data + header_size = int(r.headers["Inference-Header-Content-Length"]) + # Assert input == output since this tests an identity model + self.assertEqual( + expected_output_bytes, + r.content[header_size:], + "Expected response body contains correct output binary data: {}; got: {}".format( + expected_output_bytes, r.content[header_size:] + ), + ) + + def test_raw_binary(self): + model = "onnx_zero_1_float32" + input_bytes = np.arange(8, dtype=np.float32).tobytes() + self._raw_binary_helper(model, input_bytes, input_bytes) + + def test_raw_binary_longer(self): + # Similar to test_raw_binary but test with different data size + model = "onnx_zero_1_float32" + input_bytes = np.arange(32, dtype=np.float32).tobytes() + self._raw_binary_helper(model, input_bytes, input_bytes) + + def test_byte(self): + # Select model that satisfies constraints for raw binary request + # i.e. BYTE type the element count must be 1 + model = "onnx_zero_1_object_1_element" + input = "427" + headers = {"Inference-Header-Content-Length": "0"} + r = requests.post(self._get_infer_url(model), data=input, headers=headers) + r.raise_for_status() + + # Get the inference header size so we can locate the output binary data + header_size = int(r.headers["Inference-Header-Content-Length"]) + # Triton returns BYTES tensor with byte size prepended + output = r.content[header_size + 4 :].decode() + self.assertEqual( + input, + output, + "Expected response body contains correct output binary data: {}; got: {}".format( + input, output + ), + ) + + def test_byte_too_many_elements(self): + # Select model that doesn't satisfy constraints for raw binary request + # i.e. BYTE type the element count must be 1 + model = "onnx_zero_1_object" + input = "427" + headers = {"Inference-Header-Content-Length": "0"} + r = requests.post(self._get_infer_url(model), data=input, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + self.assertIn( + "For BYTE datatype raw input 'INPUT0', the model must have input shape [1]", + r.content.decode(), + ) + + def test_multi_variable_dimensions(self): + # Select model that doesn't satisfy constraints for raw binary request + # i.e. 
this model has multiple variable-sized dimensions + model = "onnx_zero_1_float16" + input = np.ones([2, 2], dtype=np.float16) + headers = {"Inference-Header-Content-Length": "0"} + r = requests.post( + self._get_infer_url(model), data=input.tobytes(), headers=headers + ) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + self.assertIn( + "The shape of the raw input 'INPUT0' can not be deduced because there are more than one variable-sized dimension", + r.content.decode(), + ) + + def test_multi_inputs(self): + # Select model that doesn't satisfy constraints for raw binary request + # i.e. input count must be 1 + model = "onnx_zero_3_float32" + # Use one numpy array, after tobytes() it can be seen as three inputs + # each with 8 elements (this ambiguity is why this is not allowed) + input = np.arange(24, dtype=np.float32) + headers = {"Inference-Header-Content-Length": "0"} + r = requests.post( + self._get_infer_url(model), data=input.tobytes(), headers=headers + ) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + self.assertIn( + "Raw request must only have 1 input (found 1) to be deduced but got 3 inputs in", + r.content.decode(), + ) + + # This is to test that a properly chunk-encoded request by the caller works, + # though Triton does not specifically do any special chunk handling outside + # of underlying HTTP libraries used + # Future Enhancement: Test other encodings as they come up + def test_content_encoding_chunked_manually(self): + # Similar to test_raw_binary but test with extra headers + extra_headers = {"Transfer-Encoding": "chunked"} + model = "onnx_zero_1_float32" + input_bytes = np.arange(8, dtype=np.float32).tobytes() + # Encode input into a single chunk (for simplicity) following chunked + # encoding format: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding + chunk_encoded_input = b"" + # Length of chunk in hexadecimal and line separator + chunk_encoded_input += f"{len(input_bytes):X}\r\n".encode("utf-8") + # Chunk bytes and line separator + chunk_encoded_input += input_bytes + b"\r\n" + # Final byte (0) and end message + chunk_encoded_input += b"0\r\n\r\n" + self._raw_binary_helper(model, chunk_encoded_input, input_bytes, extra_headers) + + # Test that Python client rejects any "Transfer-Encoding" HTTP headers + # as we don't specially handle encoding requests for the user through + # these headers. There are special arguments exposed in the client to + # handle some "Content-Encoding" headers. + def test_content_encoding_unsupported_client(self): + for encoding in ["chunked", "compress", "deflate", "gzip"]: + with self.subTest(encoding=encoding): + headers = {"Transfer-Encoding": encoding} + np_input = np.arange(8, dtype=np.float32).reshape(1, -1) + model = "onnx_zero_1_float32" + # Setup inputs + inputs = [] + inputs.append( + tritonhttpclient.InferInput( + "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype) + ) + ) + inputs[0].set_data_from_numpy(np_input) + + with tritonhttpclient.InferenceServerClient("localhost:8000") as client: + # Python client is expected to raise an exception to reject + # 'content-encoding' HTTP headers. 
+ with self.assertRaisesRegex( + InferenceServerException, "Unsupported HTTP header" + ): + client.infer(model_name=model, inputs=inputs, headers=headers) + + def test_descriptive_status_code(self): + model = "onnx_zero_1_float32_queue" + input_bytes = np.arange(8, dtype=np.float32).tobytes() + + # Send two requests to model that only queues 1 request at the maximum, + # Expect the second request will be rejected with HTTP status code that + # aligns with error detail (server unavailable). + t = threading.Thread( + target=self._raw_binary_helper, args=(model, input_bytes, input_bytes) + ) + t.start() + time.sleep(0.5) + with self.assertRaises(requests.exceptions.HTTPError) as context: + self._raw_binary_helper(model, input_bytes, input_bytes) + self.assertEqual( + 503, + context.exception.response.status_code, + "Expected error code {} returned for the request; got: {}".format( + 503, + context.exception.response.status_code, + ), + ) + t.join() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http/nginx.conf b/qa/L0_http/nginx.conf new file mode 100644 index 0000000000..fb62ca719c --- /dev/null +++ b/qa/L0_http/nginx.conf @@ -0,0 +1,57 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
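+
+# Minimal reverse-proxy setup used by the L0_http basic-auth tests: requests
+# to localhost:8004 must pass HTTP basic authentication (credentials generated
+# into the 'pswd' file by test.sh) and are then forwarded to Triton's default
+# HTTP endpoint on localhost:8000.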
+ +worker_processes 1; + +error_log /var/log/nginx/error.log; + +events { + worker_connections 1024; +} + +http { + # Configure basic authentication + auth_basic "Restricted Content"; + auth_basic_user_file /opt/tritonserver/qa/L0_http/pswd; + + # Define upstream server + upstream backend { + server localhost:8000; + } + + # Define server block for reverse proxy + server { + listen 8004; + + # Configure location for reverse proxy + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + } + } +} diff --git a/qa/L0_http/python_http_aio_test.py b/qa/L0_http/python_http_aio_test.py new file mode 100755 index 0000000000..bd8d342bb1 --- /dev/null +++ b/qa/L0_http/python_http_aio_test.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
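+
+# The tests below encode their assumptions about the running server: it is
+# reachable on localhost:8000, uses poll-mode model control (so explicit
+# load/unload requests are rejected), and serves a repository that includes
+# the "simple" model among 7 models in total.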
+ +import unittest + +import tritonclient.http.aio as httpclient +from tritonclient.utils import * + + +class TestHttpAioClient(unittest.IsolatedAsyncioTestCase): + """Test that the aio HTTP client can reach the server""" + + async def asyncSetUp(self): + self._triton_client = httpclient.InferenceServerClient(url="localhost:8000") + + async def asyncTearDown(self): + await self._triton_client.close() + + async def test_is_server_live(self): + ret = await self._triton_client.is_server_live() + self.assertEqual(ret, True) + + async def test_is_server_ready(self): + ret = await self._triton_client.is_server_ready() + self.assertEqual(ret, True) + + async def test_is_model_ready(self): + ret = await self._triton_client.is_model_ready("simple") + self.assertEqual(ret, True) + + async def test_get_server_metadata(self): + ret = await self._triton_client.get_server_metadata() + self.assertEqual(ret["name"], "triton") + + async def test_get_model_metadata(self): + ret = await self._triton_client.get_model_metadata("simple") + self.assertEqual(ret["name"], "simple") + + async def test_get_model_config(self): + ret = await self._triton_client.get_model_config("simple") + self.assertEqual(ret["name"], "simple") + + async def test_get_model_repository_index(self): + ret = await self._triton_client.get_model_repository_index() + self.assertEqual(len(ret), 7) + + async def test_load_model(self): + with self.assertRaisesRegex( + InferenceServerException, + "explicit model load / unload is not allowed if polling is enabled", + ): + await self._triton_client.load_model("simple") + + async def test_unload_model(self): + with self.assertRaisesRegex( + InferenceServerException, + "explicit model load / unload is not allowed if polling is enabled", + ): + await self._triton_client.unload_model("simple") + + async def test_get_inference_statistics(self): + await self._triton_client.get_inference_statistics() + + async def test_update_trace_settings(self): + await self._triton_client.update_trace_settings() + + async def test_get_trace_settings(self): + await self._triton_client.get_trace_settings() + + async def test_get_system_shared_memory_status(self): + await self._triton_client.get_system_shared_memory_status() + + async def test_register_system_shared_memory(self): + with self.assertRaisesRegex(InferenceServerException, ""): + await self._triton_client.register_system_shared_memory("", "", 0) + + async def test_unregister_system_shared_memory(self): + await self._triton_client.unregister_system_shared_memory() + + async def test_get_cuda_shared_memory_status(self): + await self._triton_client.get_cuda_shared_memory_status() + + async def test_register_cuda_shared_memory(self): + with self.assertRaisesRegex(InferenceServerException, ""): + await self._triton_client.register_cuda_shared_memory("", b"", 0, 0) + + async def test_unregister_cuda_shared_memory(self): + await self._triton_client.unregister_cuda_shared_memory() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh new file mode 100755 index 0000000000..572c527ba4 --- /dev/null +++ b/qa/L0_http/test.sh @@ -0,0 +1,758 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +CLIENT_PLUGIN_TEST="./http_client_plugin_test.py" +BASIC_AUTH_TEST="./http_basic_auth_test.py" +RESTRICTED_API_TEST="./http_restricted_api_test.py" +NGINX_CONF="./nginx.conf" +# On windows the paths invoked by the script (running in WSL) must use +# /mnt/c when needed but the paths on the tritonserver command-line +# must be C:/ style. 
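+# For example, in the WSL case below the server binary is invoked through its
+# WSL path (/mnt/c/tritonserver/bin/tritonserver.exe) while model paths such
+# as MODELDIR are passed to it as Windows-style paths (C:/models).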
+if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then + SDKDIR=${SDKDIR:=C:/sdk} + MODELDIR=${MODELDIR:=C:/models} + DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} + BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} + SERVER=${SERVER:=/mnt/c/tritonserver/bin/tritonserver.exe} + + SIMPLE_AIO_INFER_CLIENT_PY=${SDKDIR}/python/simple_http_aio_infer_client.py + SIMPLE_HEALTH_CLIENT_PY=${SDKDIR}/python/simple_http_health_metadata.py + SIMPLE_INFER_CLIENT_PY=${SDKDIR}/python/simple_http_infer_client.py + SIMPLE_ASYNC_INFER_CLIENT_PY=${SDKDIR}/python/simple_http_async_infer_client.py + SIMPLE_STRING_INFER_CLIENT_PY=${SDKDIR}/python/simple_http_string_infer_client.py + SIMPLE_IMAGE_CLIENT_PY=${SDKDIR}/python/image_client.py + # SIMPLE_ENSEMBLE_IMAGE_CLIENT_PY=${SDKDIR}/python/ensemble_image_client.py + SIMPLE_SHM_STRING_CLIENT_PY=${SDKDIR}/python/simple_http_shm_string_client.py + SIMPLE_SHM_CLIENT_PY=${SDKDIR}/python/simple_http_shm_client.py + SIMPLE_CUDASHM_CLIENT_PY=${SDKDIR}/python/simple_http_cudashm_client.py + SIMPLE_MODEL_CONTROL_PY=${SDKDIR}/python/simple_http_model_control.py + SIMPLE_SEQUENCE_INFER_CLIENT_PY=${SDKDIR}/python/simple_http_sequence_sync_infer_client.py + SIMPLE_REUSE_INFER_OBJECTS_CLIENT_PY=${SDKDIR}/python/reuse_infer_objects_client.py + + SIMPLE_HEALTH_CLIENT=${SDKDIR}/python/simple_http_health_metadata + SIMPLE_INFER_CLIENT=${SDKDIR}/python/simple_http_infer_client + SIMPLE_STRING_INFER_CLIENT=${SDKDIR}/python/simple_http_string_infer_client + SIMPLE_ASYNC_INFER_CLIENT=${SDKDIR}/python/simple_http_async_infer_client + SIMPLE_MODEL_CONTROL=${SDKDIR}/python/simple_http_model_control + SIMPLE_SEQUENCE_INFER_CLIENT=${SDKDIR}/python/simple_http_sequence_sync_infer_client + SIMPLE_SHM_CLIENT=${SDKDIR}/python/simple_http_shm_client + SIMPLE_CUDASHM_CLIENT=${SDKDIR}/python/simple_http_cudashm_client + SIMPLE_REUSE_INFER_OBJECTS_CLIENT=${SDKDIR}/python/reuse_infer_objects_client + # [FIXME] point to proper client + CC_UNIT_TEST=${SDKDIR}/python/cc_client_test +else + MODELDIR=${MODELDIR:=`pwd`/models} + DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} + SERVER=${TRITON_DIR}/bin/tritonserver + BACKEND_DIR=${TRITON_DIR}/backends + + SIMPLE_AIO_INFER_CLIENT_PY=../clients/simple_http_aio_infer_client.py + SIMPLE_HEALTH_CLIENT_PY=../clients/simple_http_health_metadata.py + SIMPLE_INFER_CLIENT_PY=../clients/simple_http_infer_client.py + SIMPLE_ASYNC_INFER_CLIENT_PY=../clients/simple_http_async_infer_client.py + SIMPLE_STRING_INFER_CLIENT_PY=../clients/simple_http_string_infer_client.py + SIMPLE_IMAGE_CLIENT_PY=../clients/image_client.py + # SIMPLE_ENSEMBLE_IMAGE_CLIENT_PY=../clients/ensemble_image_client.py + SIMPLE_SHM_STRING_CLIENT_PY=../clients/simple_http_shm_string_client.py + SIMPLE_SHM_CLIENT_PY=../clients/simple_http_shm_client.py + SIMPLE_CUDASHM_CLIENT_PY=../clients/simple_http_cudashm_client.py + SIMPLE_MODEL_CONTROL_PY=../clients/simple_http_model_control.py + SIMPLE_SEQUENCE_INFER_CLIENT_PY=../clients/simple_http_sequence_sync_infer_client.py + SIMPLE_REUSE_INFER_OBJECTS_CLIENT_PY=../clients/reuse_infer_objects_client.py + + SIMPLE_HEALTH_CLIENT=../clients/simple_http_health_metadata + SIMPLE_INFER_CLIENT=../clients/simple_http_infer_client + SIMPLE_STRING_INFER_CLIENT=../clients/simple_http_string_infer_client + SIMPLE_ASYNC_INFER_CLIENT=../clients/simple_http_async_infer_client + SIMPLE_MODEL_CONTROL=../clients/simple_http_model_control + 
SIMPLE_SEQUENCE_INFER_CLIENT=../clients/simple_http_sequence_sync_infer_client + SIMPLE_SHM_CLIENT=../clients/simple_http_shm_client + SIMPLE_CUDASHM_CLIENT=../clients/simple_http_cudashm_client + SIMPLE_REUSE_INFER_OBJECTS_CLIENT=../clients/reuse_infer_objects_client + CC_UNIT_TEST=../clients/cc_client_test +fi + +# Add string_dyna_sequence model to repo +cp -r ${MODELDIR}/simple_dyna_sequence ${MODELDIR}/simple_string_dyna_sequence +sed -i "s/simple_dyna_sequence/simple_string_dyna_sequence/g" ${MODELDIR}/simple_string_dyna_sequence/config.pbtxt +sed -i "s/^platform: .*/backend: \"dyna_sequence\"/g" ${MODELDIR}/simple_string_dyna_sequence/config.pbtxt +sed -i "/CONTROL_SEQUENCE_CORRID/{n;s/data_type:.*/data_type: TYPE_STRING/}" ${MODELDIR}/simple_string_dyna_sequence/config.pbtxt +rm -f ${MODELDIR}/simple_string_dyna_sequence/1/model.graphdef +cp ../custom_models/custom_dyna_sequence_int32/1/libtriton_dyna_sequence.so ${MODELDIR}/simple_string_dyna_sequence/1/ + +rm -f *.log +rm -f *.log.* + +set -e + +CLIENT_LOG=`pwd`/client.log +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}" +source ../common/util.sh + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Test health +python $SIMPLE_HEALTH_CLIENT_PY -v >> ${CLIENT_LOG}.health 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.health + RET=1 +fi + +IMAGE=../images/vulture.jpeg +for i in \ + $SIMPLE_AIO_INFER_CLIENT_PY \ + $SIMPLE_INFER_CLIENT_PY \ + $SIMPLE_ASYNC_INFER_CLIENT_PY \ + $SIMPLE_IMAGE_CLIENT_PY \ + $SIMPLE_ENSEMBLE_IMAGE_CLIENT_PY \ + $SIMPLE_SHM_STRING_CLIENT_PY \ + $SIMPLE_SHM_CLIENT_PY \ + $SIMPLE_CUDASHM_CLIENT_PY \ + $SIMPLE_STRING_INFER_CLIENT_PY \ + $SIMPLE_SEQUENCE_INFER_CLIENT_PY \ + ; do + BASE=$(basename -- $i) + SUFFIX="${BASE%.*}" + if [ $SUFFIX == "image_client" ]; then + python $i -m inception_graphdef -s INCEPTION -a -c 1 -b 1 $IMAGE >> "${CLIENT_LOG}.async.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.async.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.async.${SUFFIX} + RET=1 + fi + python $i -m inception_graphdef -s INCEPTION -c 1 -b 1 $IMAGE >> "${CLIENT_LOG}.${SUFFIX}" 2>&1 + if [ `grep -c VULTURE ${CLIENT_LOG}.${SUFFIX}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***" + cat $CLIENT_LOG.${SUFFIX} + RET=1 + fi + # elif [ $SUFFIX == "ensemble_image_client" ]; then + # python $i -c 1 ../images >> "${CLIENT_LOG}.${SUFFIX}" 2>&1 + # for result in "SPORTS CAR" "COFFEE MUG" "VULTURE"; do + # if [ `grep -c "$result" ${CLIENT_LOG}.${SUFFIX}` != "1" ]; then + # echo -e "\n***\n*** Failed. Expected 1 $result result\n***" + # RET=1 + # fi + # done + else + python $i -v >> "${CLIENT_LOG}.${SUFFIX}" 2>&1 + fi + + if [ $? -ne 0 ]; then + cat "${CLIENT_LOG}.${SUFFIX}" + RET=1 + fi +done + +# Test while reusing the InferInput and InferRequestedOutput objects +$SIMPLE_REUSE_INFER_OBJECTS_CLIENT_PY -v >> ${CLIENT_LOG}.reuse 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.reuse + RET=1 +fi + +# Test with the base path in url. +$SIMPLE_INFER_CLIENT_PY -u localhost:8000/base_path -v >> ${CLIENT_LOG}.base_path_url 2>&1 +if [ $? 
-eq 0 ]; then + cat ${CLIENT_LOG}.base_path_url + RET=1 +fi +if [ $(cat ${CLIENT_LOG}.base_path_url | grep "POST /base_path/v2/models/simple/infer" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}.base_path_url + RET=1 +fi + +for i in \ + $SIMPLE_INFER_CLIENT \ + $SIMPLE_STRING_INFER_CLIENT \ + $SIMPLE_ASYNC_INFER_CLIENT \ + $SIMPLE_HEALTH_CLIENT \ + $SIMPLE_SHM_CLIENT \ + $SIMPLE_CUDASHM_CLIENT \ + $SIMPLE_SEQUENCE_INFER_CLIENT \ + ; do + BASE=$(basename -- $i) + SUFFIX="${BASE%.*}" + + $i -v -H test:1 >> ${CLIENT_LOG}.c++.${SUFFIX} 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.${SUFFIX} + RET=1 + fi +done + +# Test with json input and output data +$SIMPLE_STRING_INFER_CLIENT --json-input-data --json-output-data >> ${CLIENT_LOG}.c++.json 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.json + RET=1 +fi + +# Test while reusing the InferInput and InferRequestedOutput objects +$SIMPLE_REUSE_INFER_OBJECTS_CLIENT -v >> ${CLIENT_LOG}.c++.reuse 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.reuse + RET=1 +fi + +python $CLIENT_PLUGIN_TEST >> ${CLIENT_LOG}.python.plugin 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.python.plugin + RET=1 +fi + +# Create a password file with username:password +echo -n 'username:' > pswd +echo "password" | openssl passwd -stdin -apr1 >> pswd +nginx -c `pwd`/$NGINX_CONF + +python $BASIC_AUTH_TEST +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.python.plugin.auth + RET=1 +fi +service nginx stop + +# Test with the base path in url. +$SIMPLE_INFER_CLIENT -u localhost:8000/base_path -v >> ${CLIENT_LOG}.c++.base_path_url 2>&1 +if [ $? -eq 0 ]; then + cat ${CLIENT_LOG}.c++.base_path_url + RET=1 +fi +if [ $(cat ${CLIENT_LOG}.c++.base_path_url | grep "POST /base_path/v2/models/simple/infer" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}.c++.base_path_url + RET=1 +fi + + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --model-control-mode=explicit" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Test Model Control API +python $SIMPLE_MODEL_CONTROL_PY -v >> ${CLIENT_LOG}.model_control 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.model_control + RET=1 +fi + +if [ $(cat ${CLIENT_LOG}.model_control | grep "PASS" | wc -l) -ne 1 ]; then + cat ${CLIENT_LOG}.model_control + RET=1 +fi +if [ $(cat ${SERVER_LOG} | grep "Invalid config override" | wc -l) -eq 0 ]; then + cat ${SERVER_LOG} + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --model-control-mode=explicit" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Test Model Control API +$SIMPLE_MODEL_CONTROL -v >> ${CLIENT_LOG}.c++.model_control 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.model_control + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test with dynamic sequence models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server_dyna.log" +CLIENT_LOG="./client_dyna.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e + +for i in \ + $SIMPLE_SEQUENCE_INFER_CLIENT \ + $SIMPLE_SEQUENCE_INFER_CLIENT_PY; do + + $i -v -d >>$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test combinations of binary and JSON data +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server_binaryjson.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# no parameters, no outputs == json output +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]},{"name":"INPUT1","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\]" ./curl.out` != "1" ]; then + RET=1 +fi + +# binary_data=true on INPUT0, binary_data=false on INPUT1 +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]},{"name":"INPUT1","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]}],"outputs":[{"name":"OUTPUT0","parameters":{"binary_data":true}},{"name":"OUTPUT1","parameters":{"binary_data":false}}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32\]" ./curl.out` != "0" ]; then + RET=1 +fi +if [ `grep -c "\[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\]" ./curl.out` != "1" ]; then + RET=1 +fi + +# binary_data=true on INPUT0, binary_data not given on INPUT1 +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]},{"name":"INPUT1","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]}],"outputs":[{"name":"OUTPUT0","parameters":{"binary_data":true}},{"name":"OUTPUT1"}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32\]" ./curl.out` != "0" ]; then + RET=1 +fi +if [ `grep -c "\[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\]" ./curl.out` != "1" ]; then + RET=1 +fi + +# binary_data_output=true, no outputs requested +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"parameters":{"binary_data_output":true},"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]},{"name":"INPUT1","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32\]" ./curl.out` != "0" ]; then + RET=1 +fi +if [ `grep -c "\[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\]" ./curl.out` != "0" ]; then + RET=1 +fi + +# binary_data_output=true +# binary_data=false on INPUT0, binary_data not given on INPUT1 +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out 
-d'{"parameters":{"binary_data_output":true},"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]},{"name":"INPUT1","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]}],"outputs":[{"name":"OUTPUT0","parameters":{"binary_data":false}},{"name":"OUTPUT1"}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\]" ./curl.out` != "1" ]; then + RET=1 +fi + +# Send bad request where the 'data' field misaligns with the 'shape' field of the input +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" == "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\{\"error\":\"Unable to parse 'data': Shape does not match true shape of 'data' field\"\}" ./curl.out` != "1" ]; then + RET=1 +fi + +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" == "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\{\"error\":\"Unable to parse 'data': Shape does not match true shape of 'data' field\"\}" ./curl.out` != "1" ]; then + RET=1 +fi + +# Check if the server is still working after the above bad requests +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -d'{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]},{"name":"INPUT1","datatype":"INT32","shape":[1,16],"data":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]}]}' localhost:8000/v2/models/simple/infer` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ `grep -c "\[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\]" ./curl.out` != "1" ]; then + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Run cpp client unit test +rm -rf unit_test_models && mkdir unit_test_models +cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/. +cp -r ${MODELDIR}/simple unit_test_models/. + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=unit_test_models + --trace-file=global_unittest.log --trace-level=TIMESTAMPS --trace-rate=1" +SERVER_LOG="./inference_server_cc_unit_test.log" +CLIENT_LOG="./cc_unit_test.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Run all unit tests except load +$CC_UNIT_TEST --gtest_filter=HTTP*:-*Load* >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG} + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Run cpp client load API unit test +rm -rf unit_test_models && mkdir unit_test_models +cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/. 
+# Make only version 2, 3 is valid version directory while config requests 1, 3 +rm -rf unit_test_models/onnx_int32_int32_int32/1 + +# Start with EXPLICIT mode and load onnx_float32_float32_float32 +SERVER_ARGS="--model-repository=`pwd`/unit_test_models \ + --model-control-mode=explicit \ + --load-model=onnx_int32_int32_int32 \ + --strict-model-config=false" +SERVER_LOG="./inference_server_cc_unit_test.load.log" +CLIENT_LOG="./cc_unit_test.load.log" + +for i in \ + "LoadWithFileOverride" \ + "LoadWithConfigOverride" \ + ; do + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + $CC_UNIT_TEST --gtest_filter=HTTP*$i >> ${CLIENT_LOG}.$i 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.$i + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Run python http aio unit test +PYTHON_HTTP_AIO_TEST=python_http_aio_test.py +CLIENT_LOG=`pwd`/python_http_aio_test.log +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e +python $PYTHON_HTTP_AIO_TEST > $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Python HTTP AsyncIO Test Failed\n***" + RET=1 +fi +set -e +kill $SERVER_PID +wait $SERVER_PID + +# Run python unit test +MODELDIR=python_unit_test_models +mkdir -p $MODELDIR +rm -rf ${MODELDIR}/* +cp -r $DATADIR/qa_identity_model_repository/onnx_zero_1_float32 ${MODELDIR}/. +cp -r $DATADIR/qa_identity_model_repository/onnx_zero_1_object ${MODELDIR}/. +cp -r $DATADIR/qa_identity_model_repository/onnx_zero_1_float16 ${MODELDIR}/. +cp -r $DATADIR/qa_identity_model_repository/onnx_zero_3_float32 ${MODELDIR}/. +cp -r ${MODELDIR}/onnx_zero_1_object ${MODELDIR}/onnx_zero_1_object_1_element && \ + (cd $MODELDIR/onnx_zero_1_object_1_element && \ + sed -i "s/onnx_zero_1_object/onnx_zero_1_object_1_element/" config.pbtxt && \ + sed -i "0,/-1/{s/-1/1/}" config.pbtxt) +# Model for error code test +cp -r ${MODELDIR}/onnx_zero_1_float32 ${MODELDIR}/onnx_zero_1_float32_queue && \ + (cd $MODELDIR/onnx_zero_1_float32_queue && \ + sed -i "s/onnx_zero_1_float32/onnx_zero_1_float32_queue/" config.pbtxt && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 1000000" >> config.pbtxt && \ + echo " preferred_batch_size: [ 8 ]" >> config.pbtxt && \ + echo " default_queue_policy {" >> config.pbtxt && \ + echo " max_queue_size: 1" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}" +SERVER_LOG="./inference_server_http_test.log" +CLIENT_LOG="./http_test.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +TEST_RESULT_FILE='test_results.txt' +PYTHON_TEST=http_test.py +EXPECTED_NUM_TESTS=9 +set +e +python $PYTHON_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +### LLM / Generate REST API Endpoint Tests ### + +# Helper library to parse SSE events +# https://github.com/mpetazzoni/sseclient +pip install sseclient-py + +SERVER_ARGS="--model-repository=`pwd`/generate_models" +SERVER_LOG="./inference_server_generate_endpoint_test.log" +CLIENT_LOG="./generate_endpoint_test.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +## Python Unit Tests +TEST_RESULT_FILE='test_results.txt' +PYTHON_TEST=generate_endpoint_test.py +EXPECTED_NUM_TESTS=17 +set +e +python $PYTHON_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +### Test Restricted APIs ### +### Repeated API not allowed + +MODELDIR="`pwd`/models" +SERVER_ARGS="--model-repository=${MODELDIR} + --http-restricted-api=model-repository,health:k1=v1 \ + --http-restricted-api=metadata,health:k2=v2" +SERVER_LOG="./http_restricted_endpoint_test.log" +CLIENT_LOG="./http_restricted_endpoint_test.log" +run_server +EXPECTED_MSG="api 'health' can not be specified in multiple config groups" +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +elif [ `grep -c "${EXPECTED_MSG}" ${SERVER_LOG}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected ${EXPECTED_MSG} to be found in log\n***" + cat $SERVER_LOG + RET=1 +fi + +### Test Unknown Restricted API### +### Unknown API not allowed + +MODELDIR="`pwd`/models" +SERVER_ARGS="--model-repository=${MODELDIR} + --http-restricted-api=model-reposit,health:k1=v1 \ + --http-restricted-api=metadata,health:k2=v2" +run_server +EXPECTED_MSG="unknown restricted api 'model-reposit'" +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +elif [ `grep -c "${EXPECTED_MSG}" ${SERVER_LOG}` != "1" ]; then + echo -e "\n***\n*** Failed. Expected ${EXPECTED_MSG} to be found in log\n***" + cat $SERVER_LOG + RET=1 +fi + +### Test Restricted APIs ### +### Restricted model-repository, metadata, and inference + +SERVER_ARGS="--model-repository=${MODELDIR} \ + --http-restricted-api=model-repository:admin-key=admin-value \ + --http-restricted-api=inference,metadata:infer-key=infer-value" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +set +e + +python $RESTRICTED_API_TEST RestrictedAPITest > $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Python HTTP Restricted Protocol Test Failed\n***" + RET=1 +fi +set -e +kill $SERVER_PID +wait $SERVER_PID + +### + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_http_fuzz/fuzztest.py b/qa/L0_http_fuzz/fuzztest.py new file mode 100755 index 0000000000..8e84ffffc7 --- /dev/null +++ b/qa/L0_http_fuzz/fuzztest.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import glob +import os +import sqlite3 +import unittest + +import test_util as tu +from boofuzz import * + + +class FuzzTest(tu.TestResultCollector): + def _run_fuzz(self, url, logger): + session = Session( + target=Target(connection=TCPSocketConnection("127.0.0.1", 8000)), + fuzz_loggers=logger, + keep_web_open=False, + ) + + s_initialize(name="Request" + url) + with s_block("Request-Line"): + s_group( + "Method", + ["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE"], + ) + s_delim(" ", name="space-1") + s_string(url, name="Request-URI") + s_delim(" ", name="space-2") + s_string("HTTP/1.1", name="HTTP-Version") + s_static("\r\n", name="Request-Line-CRLF") + s_static("\r\n", "Request-CRLF") + + session.connect(s_get("Request" + url)) + session.fuzz() + + def test_failures_from_db(self): + url_list = [ + "/v2", + "/v2/models/simple", + "/v2/models/simple/infer", + "/v2/models/simple/versions/v1", + "/v2/models/simple/config", + "/v2/models/simple/stats", + "/v2/models/simple/ready", + "/v2/health/ready", + "/v2/health/live", + "/v2/repository/index", + "/v2/repository/models/simple/unload", + "/v2/repository/models/simple/load", + "/v2/systemsharedmemory/status", + "/v2/systemsharedmemory/register", + "/v2/systemsharedmemory/unregister", + "/v2/systemsharedmemory/region/xx/status", + "/v2/cudasharedmemory/status", + "/v2/cudasharedmemory/register", + "/v2/cudasharedmemory/unregister", + "/v2/cudasharedmemory/region/xx/status", + ] + + csv_log = open("fuzz_results.csv", "w") + logger = [FuzzLoggerCsv(file_handle=csv_log)] + + for url in url_list: + self._run_fuzz(url, logger) + + # Get latest db file + files = glob.glob("boofuzz-results/*") + dbfile = max(files, key=os.path.getctime) + + conn = sqlite3.connect(dbfile) + c = conn.cursor() + + # Get number of failures, should be 0 + self.assertEqual( + len([x for x in c.execute('SELECT * FROM steps WHERE type="fail"')]), 0 + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_http_fuzz/test.sh 
b/qa/L0_http_fuzz/test.sh new file mode 100755 index 0000000000..30cf02bc2a --- /dev/null +++ b/qa/L0_http_fuzz/test.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +RET=0 +rm -f *.log *.db +EXPECTED_NUM_TESTS="1" + +mkdir -p models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/savedmodel_zero_1_object models/ + +FUZZTEST=fuzztest.py +FUZZ_LOG=`pwd`/fuzz.log +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +source ../common/util.sh + +# Remove this once foobuzz and tornado packages upgrade to work with python 3.10 +# This test tests the server's ability to handle poor input and not the compatibility +# with python 3.10. Python 3.8 is ok to use here. +function_install_python38() { + source ../L0_backend_python/common.sh + install_conda + create_conda_env "3.8" "python-3-8" + + # Install test script dependencies + pip3 install --upgrade wheel setuptools boofuzz==0.3.0 "numpy<2" pillow attrdict future grpcio requests gsutil \ + awscli six grpcio-channelz prettytable virtualenv +} +function_install_python38 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Test health +python3 $FUZZTEST -v >> ${FUZZ_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat ${FUZZ_LOG} + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_https/nginx.conf b/qa/L0_https/nginx.conf new file mode 100644 index 0000000000..e3a78b14e1 --- /dev/null +++ b/qa/L0_https/nginx.conf @@ -0,0 +1,38 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +server { + listen 443 ssl; + server_name localhost; + + ssl_certificate /etc/nginx/cert.crt; + ssl_certificate_key /etc/nginx/cert.key; + + location / { + proxy_pass http://localhost:8000; + proxy_http_version 1.1; + } +} diff --git a/qa/L0_https/test.sh b/qa/L0_https/test.sh new file mode 100755 index 0000000000..2c030332e5 --- /dev/null +++ b/qa/L0_https/test.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +SIMPLE_AIO_INFER_CLIENT_PY=../clients/simple_http_aio_infer_client.py +SIMPLE_INFER_CLIENT_PY=../clients/simple_http_infer_client.py +TEST_CLIENT=../clients/simple_http_infer_client + +NGINX_CONF=`pwd`/nginx.conf +CLIENT_LOG=`pwd`/client.log +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +source ../common/util.sh + +rm -f *.key *.crt ${CLIENT_LOG}.* server.log + +# Generate valid CA +openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" + +# Generate valid Server Key/Cert +openssl genrsa -passout pass:1234 -des3 -out server.key 4096 +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt + +# Remove passphrase from the Server Key +openssl rsa -passin pass:1234 -in server.key -out server.key + +# Generate valid Client Key/Cert +openssl genrsa -passout pass:1234 -des3 -out client.key 4096 +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt + +# Remove passphrase from Client Key +openssl rsa -passin pass:1234 -in client.key -out client.key + +# Create mutated client key (Make first char of each like capital) +cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key +cp client.crt client2.crt && sed -i "s/\b\(.\)/\u\1/g" client2.crt + +mv server.crt /etc/nginx/cert.crt +mv server.key /etc/nginx/cert.key + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Setup the new configuration for the proxy. The HTTPS traffic will be +# redirected to the running instance of server at localhost:8000 +cp ${NGINX_CONF} /etc/nginx/sites-available/default + +# Start the proxy server +service nginx restart + +set +e + +# Test basic inference with https +python $SIMPLE_INFER_CLIENT_PY -v -u localhost --ssl --key-file client.key --cert-file client.crt --ca-certs ca.crt >> ${CLIENT_LOG}.ssl_infer 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.ssl_infer + RET=1 +fi +python $SIMPLE_AIO_INFER_CLIENT_PY -v -u localhost --ssl --key-file client.key --cert-file client.crt --ca-certs ca.crt >> ${CLIENT_LOG}.ssl_infer.aio 2>&1 +if [ $? 
-ne 0 ]; then + cat ${CLIENT_LOG}.ssl_infer.aio + RET=1 +fi + +$TEST_CLIENT -v -u https://localhost:443 --key-file client.key --cert-file client.crt --ca-certs ca.crt >> ${CLIENT_LOG}.c++.ssl_infer 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.ssl_infer + RET=1 +fi + +# Test basic inference on https without peer verification +python $SIMPLE_INFER_CLIENT_PY -v -u localhost --ssl --insecure >> ${CLIENT_LOG}.ssl_infer_insecure 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.ssl_infer_insecure + RET=1 +fi +python $SIMPLE_AIO_INFER_CLIENT_PY -v -u localhost --ssl --insecure >> ${CLIENT_LOG}.ssl_infer_insecure.aio 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.ssl_infer_insecure.aio + RET=1 +fi + +$TEST_CLIENT -v -u https://localhost:443 --verify-host 0 --verify-peer 0 >> ${CLIENT_LOG}.c++.ssl_infer_insecure 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.ssl_infer_insecure + RET=1 +fi + +# Test failure cases for SSL +# Try without SSL +$SIMPLE_INFER_CLIENT_PY -v -u localhost >> ${CLIENT_LOG}.no_ssl_fail_infer 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.no_ssl_fail_infer + echo -e "\n***\n*** Expected test failure\n***" +else + RET=1 +fi +$SIMPLE_AIO_INFER_CLIENT_PY -v -u localhost >> ${CLIENT_LOG}.no_ssl_fail_infer.aio 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.no_ssl_fail_infer.aio + echo -e "\n***\n*** Expected test failure\n***" +else + RET=1 +fi + +$TEST_CLIENT -v -u https://localhost:443 >> ${CLIENT_LOG}.c++.no_ssl_fail_infer 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.no_ssl_fail_infer + echo -e "\n***\n*** Expected test failure\n***" +else + RET=1 +fi + + +# Try with incorrect key +$SIMPLE_INFER_CLIENT_PY -v -u localhost --ssl --key-file client2.key --cert-file client.crt --ca-certs ca.crt >> ${CLIENT_LOG}.ssl_wrong_key 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.ssl_wrong_key + echo -e "\n***\n*** Expected test failure\n***" +else + RET=1 +fi +$SIMPLE_AIO_INFER_CLIENT_PY -v -u localhost --ssl --key-file client2.key --cert-file client.crt --ca-certs ca.crt >> ${CLIENT_LOG}.ssl_wrong_key.aio 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.ssl_wrong_key.aio + echo -e "\n***\n*** Expected test failure\n***" +else + RET=1 +fi + +$TEST_CLIENT -v -u https://localhost:443 --key-file client2.key --cert-file client.crt --ca-certs ca.crt >> ${CLIENT_LOG}.c++.ssl_wrong_key 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.c++.ssl_wrong_key + echo -e "\n***\n*** Expected test failure\n***" +else + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Stop the proxy server +service nginx stop + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py new file mode 100755 index 0000000000..2cdf7ff2e0 --- /dev/null +++ b/qa/L0_implicit_state/implicit_state.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import unittest +from builtins import range + +import numpy as np +import test_util as tu +import tritonclient.http as tritonhttpclient +from tritonclient.utils import InferenceServerException + +BACKENDS = os.environ.get("BACKENDS", "onnx plan libtorch") + + +class ImplicitStateTest(tu.TestResultCollector): + def test_no_implicit_state(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32")) + inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32)) + inputs[1].set_data_from_numpy(np.asarray([0], dtype=np.int32)) + + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name="no_implicit_state", + inputs=inputs, + sequence_id=1, + sequence_start=True, + ) + + err_str = str(e.exception).lower() + self.assertIn("unable to add state 'undefined_state'", err_str) + self.assertIn( + "state configuration is missing for model 'no_implicit_state'", err_str + ) + + def test_wrong_implicit_state_name(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32")) + inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32)) + inputs[1].set_data_from_numpy(np.asarray([0], dtype=np.int32)) + + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name="wrong_internal_state", + inputs=inputs, + sequence_id=2, + sequence_start=True, + ) + + err_str = str(e.exception).lower() + self.assertIn("state 'undefined_state' is not a valid state name", err_str) + + def test_implicit_state_single_buffer(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32")) + inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32)) + inputs[1].set_data_from_numpy(np.asarray([2], dtype=np.int32)) + + triton_client.infer( + model_name="single_state_buffer", + inputs=inputs, + sequence_id=2, + sequence_start=True, + sequence_end=False, + ) + + triton_client.infer( + model_name="single_state_buffer", + inputs=inputs, + sequence_id=2, + sequence_start=False, + sequence_end=True, 
+ ) + + def test_implicit_state_growable_memory(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32")) + inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32)) + inputs[1].set_data_from_numpy(np.asarray([3], dtype=np.int32)) + + output = triton_client.infer( + model_name="growable_memory", + inputs=inputs, + sequence_id=2, + sequence_start=True, + sequence_end=False, + ) + output_state = output.as_numpy("OUTPUT_STATE") + expected_output_state = np.zeros(output_state.shape, dtype=np.int8) + np.testing.assert_equal(output_state, expected_output_state) + + output = triton_client.infer( + model_name="growable_memory", + inputs=inputs, + sequence_id=2, + sequence_start=False, + sequence_end=False, + ) + output_state = output.as_numpy("OUTPUT_STATE") + expected_output_state = np.concatenate( + [expected_output_state, np.ones(expected_output_state.shape, dtype=np.int8)] + ) + np.testing.assert_equal(output_state, expected_output_state) + + output = triton_client.infer( + model_name="growable_memory", + inputs=inputs, + sequence_id=2, + sequence_start=False, + sequence_end=False, + ) + output_state = output.as_numpy("OUTPUT_STATE") + expected_output_state = np.concatenate( + [ + expected_output_state, + np.full( + (expected_output_state.shape[0] // 2,), dtype=np.int8, fill_value=2 + ), + ] + ) + np.testing.assert_equal(output_state, expected_output_state) + + def test_no_update(self): + # Test implicit state without updating any state + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32")) + inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.int32)) + inputs[1].set_data_from_numpy(np.asarray([1], dtype=np.int32)) + correlation_id = 3 + + # Make sure the state is never updated. 
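+        # The test sends INPUT == 1 on every request in the sequence and, as
+        # asserted below, expects every response to report OUTPUT == 1, i.e.
+        # nothing accumulates across requests.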
+ result_start = triton_client.infer( + model_name="no_state_update", + inputs=inputs, + sequence_id=correlation_id, + sequence_start=True, + ) + self.assertEqual(result_start.as_numpy("OUTPUT")[0], 1) + for _ in range(10): + result = triton_client.infer( + model_name="no_state_update", inputs=inputs, sequence_id=correlation_id + ) + self.assertEqual(result.as_numpy("OUTPUT")[0], 1) + + _ = triton_client.infer( + model_name="no_state_update", + inputs=inputs, + sequence_id=correlation_id, + sequence_end=True, + ) + self.assertEqual(result.as_numpy("OUTPUT")[0], 1) + + def test_request_output_not_allowed(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + + for backend in BACKENDS.split(" "): + inputs = [] + if backend.strip() == "libtorch": + inputs.append(tritonhttpclient.InferInput("INPUT__0", [1], "INT32")) + else: + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.int32)) + + outputs = [] + if backend.strip() == "libtorch": + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT_STATE__1")) + else: + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT_STATE")) + + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name=f"{backend}_nobatch_sequence_int32", + inputs=inputs, + outputs=outputs, + sequence_id=1, + sequence_start=True, + sequence_end=True, + ) + if backend.strip() == "libtorch": + self.assertIn( + "unexpected inference output 'OUTPUT_STATE__1' for model", + str(e.exception), + ) + else: + self.assertIn( + "unexpected inference output 'OUTPUT_STATE' for model", + str(e.exception), + ) + + def test_request_output(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + for backend in BACKENDS.split(" "): + inputs = [] + if backend.strip() == "libtorch": + inputs.append(tritonhttpclient.InferInput("INPUT__0", [1], "INT32")) + else: + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32")) + inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.int32)) + + outputs = [] + if backend.strip() == "libtorch": + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT_STATE__1")) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT__0")) + else: + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT_STATE")) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT")) + + result = triton_client.infer( + model_name=f"{backend}_nobatch_sequence_int32_output", + inputs=inputs, + outputs=outputs, + sequence_id=1, + sequence_start=True, + sequence_end=True, + ) + if backend.strip() == "libtorch": + self.assertTrue(result.as_numpy("OUTPUT_STATE__1")[0], 1) + self.assertTrue(result.as_numpy("OUTPUT__0")[0], 1) + else: + self.assertTrue(result.as_numpy("OUTPUT_STATE")[0], 1) + self.assertTrue(result.as_numpy("OUTPUT")[0], 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_implicit_state/models/growable_memory/config.pbtxt b/qa/L0_implicit_state/models/growable_memory/config.pbtxt new file mode 100644 index 0000000000..0a7920bdf1 --- /dev/null +++ b/qa/L0_implicit_state/models/growable_memory/config.pbtxt @@ -0,0 +1,103 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "growable_memory" +backend: "implicit_state" +max_batch_size: 0 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + } + ] + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT8 + dims: [1024, 1024] + use_same_buffer_for_input_output: true + use_growable_memory: true + } + ] +} + +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "TEST_CASE" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "OUTPUT_STATE" + data_type: TYPE_INT8 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_GPU + } +] diff --git a/qa/L0_implicit_state/models/no_implicit_state/config.pbtxt b/qa/L0_implicit_state/models/no_implicit_state/config.pbtxt new file mode 100644 index 0000000000..e1540d36ed --- /dev/null +++ b/qa/L0_implicit_state/models/no_implicit_state/config.pbtxt @@ -0,0 +1,89 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "no_implicit_state" +backend: "implicit_state" +max_batch_size: 0 + +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + } + ] +} + +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "TEST_CASE" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_implicit_state/models/no_state_update/config.pbtxt b/qa/L0_implicit_state/models/no_state_update/config.pbtxt new file mode 100644 index 0000000000..e7fb6afe8f --- /dev/null +++ b/qa/L0_implicit_state/models/no_state_update/config.pbtxt @@ -0,0 +1,102 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
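+# Exercised by ImplicitStateTest.test_no_update in implicit_state.py: the
+# sequence batcher declares an implicit state (INPUT_STATE / OUTPUT_STATE)
+# with a zero-initialized initial state, and the test expects the state to
+# remain unchanged across the sequence.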
+ +name: "no_state_update" +backend: "implicit_state" +max_batch_size: 0 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + } + ] + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT32 + dims: 1 + initial_state: { + name: "state init" + data_type: TYPE_INT32 + dims: 1 + zero_data: true + } + } + ] +} + +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "TEST_CASE" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt new file mode 100644 index 0000000000..0f72d772a6 --- /dev/null +++ b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt @@ -0,0 +1,97 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "single_state_buffer" +backend: "implicit_state" +max_batch_size: 0 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + } + ] + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT32 + dims: 1 + use_same_buffer_for_input_output: true + } + ] +} + +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "TEST_CASE" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_implicit_state/models/wrong_internal_state/config.pbtxt b/qa/L0_implicit_state/models/wrong_internal_state/config.pbtxt new file mode 100644 index 0000000000..afe55ecf14 --- /dev/null +++ b/qa/L0_implicit_state/models/wrong_internal_state/config.pbtxt @@ -0,0 +1,97 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "wrong_internal_state" +backend: "implicit_state" +max_batch_size: 0 + +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + fp32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END" + control [ + { + kind: CONTROL_SEQUENCE_END + fp32_false_true: [ 0, 1 ] + } + ] + } + ] + state [ + { + input_name: "INPUT_STATE" + output_name: "OUTPUT_STATE" + data_type: TYPE_INT32 + dims: 1 + } + ] +} + +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "TEST_CASE" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh new file mode 100755 index 0000000000..0722d29be1 --- /dev/null +++ b/qa/L0_implicit_state/test.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +TEST_RESULT_FILE='test_results.txt' + +export ENSEMBLES=0 +BACKENDS=${BACKENDS:="libtorch onnx plan"} +export BACKENDS +export IMPLICIT_STATE=1 +INITIAL_STATE_ZERO=${INITIAL_STATE_ZERO:="0"} +INITIAL_STATE_FILE=${INITIAL_STATE_FILE:="0"} +SINGLE_STATE_BUFFER=${SINGLE_STATE_BUFFER:="0"} + +export INITIAL_STATE_ZERO +export INITIAL_STATE_FILE +export SINGLE_STATE_BUFFER + +MODELDIR=${MODELDIR:=`pwd`/models} +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +source ../common/util.sh + +# Setup the custom models shared library +cp ./libtriton_implicit_state.so models/no_implicit_state/ +cp ./libtriton_implicit_state.so models/no_state_update/ +cp ./libtriton_implicit_state.so models/wrong_internal_state/ +cp ./libtriton_implicit_state.so models/single_state_buffer/ +cp ./libtriton_implicit_state.so models/growable_memory/ + +mkdir -p models/no_implicit_state/1/ +mkdir -p models/no_state_update/1/ +mkdir -p models/wrong_internal_state/1/ +mkdir -p models/single_state_buffer/1/ +mkdir -p models/growable_memory/1/ + +for BACKEND in $BACKENDS; do + dtype="int32" + model_name=${BACKEND}_nobatch_sequence_${dtype} + rm -rf models/$model_name + cp -r $DATADIR/qa_sequence_implicit_model_repository/$model_name models + output_dtype= + + # In order to allow the state to be returned, the model must describe + # state as one of the outputs of the model. + model_name_allow_output=${BACKEND}_nobatch_sequence_${dtype}_output + rm -rf models/$model_name_allow_output + cp -r $DATADIR/qa_sequence_implicit_model_repository/$model_name models/$model_name_allow_output + + if [ $BACKEND == "libtorch" ]; then + (cd models/$model_name_allow_output && \ + sed -i "s/^name:.*/name: \"$model_name_allow_output\"/" config.pbtxt && \ + echo -e "output [{ name: \"OUTPUT_STATE__1\" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]" >> config.pbtxt) + else + (cd models/$model_name_allow_output && \ + sed -i "s/^name:.*/name: \"$model_name_allow_output\"/" config.pbtxt && \ + echo -e "output [{ name: \"OUTPUT_STATE\" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]" >> config.pbtxt) + fi +done + +CLIENT_LOG=`pwd`/client.log +SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --cuda-virtual-address-size=0:$((1024*1024*4))" +IMPLICIT_STATE_CLIENT='implicit_state.py' +EXPECTED_TEST_NUM=7 +rm -rf $CLIENT_LOG + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python3 $IMPLICIT_STATE_CLIENT > $CLIENT_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Implicit State FAILED\n***" + cat ${CLIENT_LOG} + exit 1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_TEST_NUM + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +(cd ../L0_sequence_batcher/ && bash -ex test.sh) +RET=$? + +if [ $RET == 0 ]; then + echo -e "\n***\n*** Implicit State Passed\n***" +else + echo -e "\n***\n*** Implicit State FAILED\n***" + exit 1 +fi + +exit $RET + diff --git a/qa/L0_infer/infer_test.py b/qa/L0_infer/infer_test.py old mode 100644 new mode 100755 index bcdfd0e694..c304917d9c --- a/qa/L0_infer/infer_test.py +++ b/qa/L0_infer/infer_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+#!/usr/bin/env python3 + +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,219 +27,1139 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + sys.path.append("../common") -from builtins import range -from future.utils import iteritems +import os import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu -from tensorrtserver.api import * +from tritonclient.utils import * + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) +CPU_ONLY = os.environ.get("TRITON_SERVER_CPU_ONLY") is not None +TEST_VALGRIND = bool(int(os.environ.get("TEST_VALGRIND", 0))) + +USE_GRPC = os.environ.get("USE_GRPC", 1) != "0" +USE_HTTP = os.environ.get("USE_HTTP", 1) != "0" +assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero" + +BACKENDS = os.environ.get( + "BACKENDS", "graphdef savedmodel onnx libtorch plan python python_dlpack openvino" +) +ENSEMBLES = bool(int(os.environ.get("ENSEMBLES", 1))) +NOBATCH = bool(int(os.environ.get("NOBATCH", 1))) +BATCH = bool(int(os.environ.get("BATCH", 1))) + +np_dtype_string = np.dtype(object) + +# 60 sec is the default value +NETWORK_TIMEOUT = 300.0 if TEST_VALGRIND else 60.0 + + +class InferTest(tu.TestResultCollector): + def _full_exact( + self, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw, + output1_raw, + swap, + network_timeout=NETWORK_TIMEOUT, + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_http_json_tensors=True, + skip_request_id_check=True, + use_streaming=True, + correlation_id=0, + network_timeout=NETWORK_TIMEOUT, + ): + for bs in (1, batch_size): + # model that does not support batching + if NOBATCH: + if bs == 1: + iu.infer_exact( + tester, + pf + "_nobatch", + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + use_http_json_tensors=use_http_json_tensors, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + network_timeout=network_timeout, + ) + + if BATCH: + # model that supports batching. 
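                    # This branch runs only when the BATCH environment variable
                    # is non-zero; the batch dimension bs is prepended to
                    # tensor_shape before infer_exact is called.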
+ iu.infer_exact( + tester, + pf, + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + use_http_json_tensors=use_http_json_tensors, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + network_timeout=network_timeout, + ) -class InferTest(unittest.TestCase): - def _full_exact(self, req_raw, input_dtype, output0_dtype, output1_dtype, swap): input_size = 16 - if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype): - # model that supports batching - for bs in (1, 8): - iu.infer_exact(self, 'graphdef', (input_size,), bs, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - iu.infer_exact(self, 'savedmodel', (input_size,), bs, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - # model that does not batching - iu.infer_exact(self, 'graphdef_nobatch', (input_size,), 1, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - iu.infer_exact(self, 'savedmodel_nobatch', (input_size,), 1, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - - if tu.validate_for_c2_model(input_dtype, output0_dtype, output1_dtype): - # model that supports batching - for bs in (1, 8): - iu.infer_exact(self, 'netdef', (input_size,), bs, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - # model that does not batching - iu.infer_exact(self, 'netdef_nobatch', (input_size,), 1, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - - if tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype): - # model that supports batching - for bs in (1, 8): - iu.infer_exact(self, 'plan', (input_size, 1, 1), bs, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) - # model that does not batching - iu.infer_exact(self, 'plan_nobatch', (input_size, 1, 1), 1, req_raw, - input_dtype, output0_dtype, output1_dtype, swap=swap) + all_ensemble_prefix = ["simple_", "sequence_", "fan_"] + ensemble_prefix = [""] + if ENSEMBLES: + for prefix in all_ensemble_prefix: + if tu.validate_for_ensemble_model( + prefix, + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + ensemble_prefix.append(prefix) + + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + for prefix in ensemble_prefix: + for pf in ["graphdef", "savedmodel"]: + if pf in BACKENDS: + _infer_exact_helper( + self, + prefix + pf, + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + network_timeout=network_timeout, + ) + + if not CPU_ONLY and tu.validate_for_trt_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size, 1, 1), + (input_size, 1, 1), + (input_size, 1, 1), + ): + for prefix in ensemble_prefix: + if "plan" in BACKENDS: + if input_dtype == np.int8: + _infer_exact_helper( + self, + prefix + "plan", + (input_size, 1, 1), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + else: + _infer_exact_helper( + self, + prefix + "plan", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + 
output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_onnx_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + for prefix in ensemble_prefix: + if "onnx" in BACKENDS: + _infer_exact_helper( + self, + prefix + "onnx", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_libtorch_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + # Due to PyTorch bug + # https://github.com/pytorch/pytorch/issues/66930 we can't + # run this test with int8 input and int32 outputs. + if ( + (input_dtype == np.int8) + and (output0_dtype == np.int32) + and (output1_dtype == np.int32) + ): + print("skipping pytorch test for int8_int32_int32") + else: + for prefix in ensemble_prefix: + if "libtorch" in BACKENDS: + # Skip batching for PyTorch String I/O + if ( + (input_dtype == np_dtype_string) + or (output0_dtype == np_dtype_string) + or (output1_dtype == np_dtype_string) + ): + iu.infer_exact( + self, + prefix + "libtorch_nobatch", + (input_size,), + 1, # batch_size + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + else: + _infer_exact_helper( + self, + prefix + "libtorch", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + for prefix in ensemble_prefix: + if prefix != "": + continue + if ( + input_dtype == np.uint8 + or output0_dtype == np.uint8 + or output1_dtype == np.uint8 + ): + continue + + if "python_dlpack" in BACKENDS: + _infer_exact_helper( + self, + prefix + "python_dlpack", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + elif "python" in BACKENDS: + _infer_exact_helper( + self, + prefix + "python", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + def test_raw_uuu(self): + self._full_exact( + np.uint8, np.uint8, np.uint8, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_bbb(self): - self._full_exact(True, np.int8, np.int8, np.int8, swap=True) + self._full_exact( + np.int8, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=True + ) + def test_raw_sss(self): - self._full_exact(True, np.int16, np.int16, np.int16, swap=True) + self._full_exact( + np.int16, np.int16, np.int16, output0_raw=True, output1_raw=True, swap=True + ) + def test_raw_iii(self): - self._full_exact(True, np.int32, np.int32, np.int32, swap=True) + self._full_exact( + np.int32, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=True + ) + def test_raw_lll(self): - self._full_exact(True, np.int64, np.int64, np.int64, swap=False) + self._full_exact( + np.int64, np.int64, np.int64, output0_raw=True, output1_raw=True, swap=False + ) + def test_raw_hhh(self): - self._full_exact(True, np.float16, np.float16, np.float16, swap=False) + self._full_exact( + np.float16, + np.float16, + np.float16, + output0_raw=True, + output1_raw=True, + swap=False, + ) + def test_raw_fff(self): - self._full_exact(True, np.float32, np.float32, np.float32, swap=True) 
+ self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) + def test_raw_hff(self): - self._full_exact(True, np.float16, np.float32, np.float32, swap=False) + self._full_exact( + np.float16, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + def test_raw_bii(self): - self._full_exact(True, np.int8, np.int32, np.int32, swap=False) + self._full_exact( + np.int8, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=False + ) + def test_raw_ibb(self): - self._full_exact(True, np.int32, np.int8, np.int8, swap=False) + self._full_exact( + np.int32, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=False + ) + def test_raw_ibs(self): - self._full_exact(True, np.int32, np.int8, np.int16, swap=False) + self._full_exact( + np.int32, np.int8, np.int16, output0_raw=True, output1_raw=True, swap=False + ) + + def test_raw_fuu(self): + self._full_exact( + np.float32, + np.uint8, + np.uint8, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_uff(self): + self._full_exact( + np.uint8, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_fuh(self): + self._full_exact( + np.float32, + np.uint8, + np.float16, + output0_raw=True, + output1_raw=True, + swap=False, + ) + def test_raw_iff(self): - self._full_exact(True, np.int32, np.float32, np.float32, swap=False) + self._full_exact( + np.int32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + def test_raw_fii(self): - self._full_exact(True, np.float32, np.int32, np.int32, swap=False) + self._full_exact( + np.float32, + np.int32, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + def test_raw_ihs(self): - self._full_exact(True, np.int32, np.float16, np.int16, swap=False) - - def test_class_bbb(self): - self._full_exact(False, np.int8, np.int8, np.int8, swap=True) - def test_class_sss(self): - self._full_exact(False, np.int16, np.int16, np.int16, swap=True) - def test_class_iii(self): - self._full_exact(False, np.int32, np.int32, np.int32, swap=True) - def test_class_lll(self): - self._full_exact(False, np.int64, np.int64, np.int64, swap=False) - def test_class_fff(self): - self._full_exact(False, np.float32, np.float32, np.float32, swap=True) - def test_class_iff(self): - self._full_exact(False, np.int32, np.float32, np.float32, swap=False) + self._full_exact( + np.int32, + np.float16, + np.int16, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_ooo(self): + self._full_exact( + np_dtype_string, + np_dtype_string, + np_dtype_string, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_oii(self): + self._full_exact( + np_dtype_string, + np.int32, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_oio(self): + self._full_exact( + np_dtype_string, + np.int32, + np_dtype_string, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_ooi(self): + self._full_exact( + np_dtype_string, + np_dtype_string, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_ioo(self): + self._full_exact( + np.int32, + np_dtype_string, + np_dtype_string, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_iio(self): + self._full_exact( + np.int32, + np.int32, + np_dtype_string, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_ioi(self): + self._full_exact( + 
np.int32, + np_dtype_string, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + # shared memory does not support class output + if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY): + + def test_class_bbb(self): + self._full_exact( + np.int8, + np.int8, + np.int8, + output0_raw=False, + output1_raw=False, + swap=True, + ) + + def test_class_sss(self): + self._full_exact( + np.int16, + np.int16, + np.int16, + output0_raw=False, + output1_raw=False, + swap=True, + ) + + def test_class_iii(self): + self._full_exact( + np.int32, + np.int32, + np.int32, + output0_raw=False, + output1_raw=False, + swap=True, + ) + + def test_class_lll(self): + self._full_exact( + np.int64, + np.int64, + np.int64, + output0_raw=False, + output1_raw=False, + swap=False, + ) + + def test_class_fff(self): + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + swap=True, + ) + + def test_class_iff(self): + self._full_exact( + np.int32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + swap=False, + ) + + def test_mix_bbb(self): + self._full_exact( + np.int8, + np.int8, + np.int8, + output0_raw=True, + output1_raw=False, + swap=True, + ) + + def test_mix_sss(self): + self._full_exact( + np.int16, + np.int16, + np.int16, + output0_raw=False, + output1_raw=True, + swap=True, + ) + + def test_mix_iii(self): + self._full_exact( + np.int32, + np.int32, + np.int32, + output0_raw=True, + output1_raw=False, + swap=True, + ) + + def test_mix_lll(self): + self._full_exact( + np.int64, + np.int64, + np.int64, + output0_raw=False, + output1_raw=True, + swap=False, + ) + + def test_mix_fff(self): + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=False, + swap=True, + ) + + def test_mix_iff(self): + self._full_exact( + np.int32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=True, + swap=False, + ) def test_raw_version_latest_1(self): input_size = 16 - tensor_shape = (input_size,) + tensor_shape = (1, input_size) # There are 3 versions of graphdef_int8_int8_int8 but # only version 3 should be available - for platform in ('graphdef', 'savedmodel'): + for platform in ("graphdef", "savedmodel"): + if platform not in BACKENDS: + continue try: - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int8, np.int8, np.int8, - model_version=1, swap=False) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int8, + np.int8, + np.int8, + model_version=1, + swap=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("Inference request for unknown model")) + self.assertTrue(ex.message().startswith("Request for unknown model")) try: - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int8, np.int8, np.int8, - model_version=2, swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int8, + np.int8, + np.int8, + model_version=2, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - 
ex.message().startswith("Inference request for unknown model")) + self.assertTrue(ex.message().startswith("Request for unknown model")) - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int8, np.int8, np.int8, - model_version=3, swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int8, + np.int8, + np.int8, + model_version=3, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) def test_raw_version_latest_2(self): input_size = 16 - tensor_shape = (input_size,) + tensor_shape = (1, input_size) # There are 3 versions of graphdef_int16_int16_int16 but only # versions 2 and 3 should be available - for platform in ('graphdef', 'savedmodel'): + for platform in ("graphdef", "savedmodel"): + if platform not in BACKENDS: + continue try: - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int16, np.int16, np.int16, - model_version=1, swap=False) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int16, + np.int16, + np.int16, + model_version=1, + swap=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("Inference request for unknown model")) - - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int16, np.int16, np.int16, - model_version=2, swap=True) - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int16, np.int16, np.int16, - model_version=3, swap=True) + self.assertTrue(ex.message().startswith("Request for unknown model")) + + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int16, + np.int16, + np.int16, + model_version=2, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int16, + np.int16, + np.int16, + model_version=3, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) def test_raw_version_all(self): input_size = 16 - tensor_shape = (input_size,) + tensor_shape = (1, input_size) # There are 3 versions of *_int32_int32_int32 and all should # be available. 
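        # Note: the rewritten loop below drops "netdef" (the Caffe2 platform,
        # which is no longer supported) and skips any platform that is not
        # listed in the BACKENDS environment variable.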
- for platform in ('graphdef', 'savedmodel', 'netdef'): - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int32, np.int32, np.int32, - model_version=1, swap=False) - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int32, np.int32, np.int32, - model_version=2, swap=True) - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int32, np.int32, np.int32, - model_version=3, swap=True) + for platform in ("graphdef", "savedmodel"): + if platform not in BACKENDS: + continue + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + model_version=1, + swap=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + model_version=2, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + model_version=3, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) def test_raw_version_specific_1(self): input_size = 16 - tensor_shape = (input_size,) + tensor_shape = (1, input_size) # There are 3 versions of *_float16_float16_float16 but only # version 1 should be available. - for platform in ('graphdef', 'savedmodel'): - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float16, np.float16, np.float16, - model_version=1, swap=False) + for platform in ("graphdef", "savedmodel"): + if platform not in BACKENDS: + continue + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float16, + np.float16, + np.float16, + model_version=1, + swap=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) try: - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float16, np.float16, np.float16, - model_version=2, swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float16, + np.float16, + np.float16, + model_version=2, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("Inference request for unknown model")) + self.assertTrue(ex.message().startswith("Request for unknown model")) try: - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float16, np.float16, np.float16, - model_version=3, swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float16, + np.float16, + np.float16, + model_version=3, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("Inference request for unknown model")) + self.assertTrue(ex.message().startswith("Request for unknown model")) def test_raw_version_specific_1_3(self): input_size = 16 # There are 
3 versions of *_float32_float32_float32 but only # versions 1 and 3 should be available. - for platform in ('graphdef', 'savedmodel', 'netdef', 'plan'): - tensor_shape = (input_size, 1, 1) if platform == 'plan' else (input_size,) - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float32, np.float32, np.float32, - model_version=1, swap=False) + for platform in ("graphdef", "savedmodel", "plan"): + if platform == "plan" and CPU_ONLY: + continue + if platform not in BACKENDS: + continue + tensor_shape = (1, input_size) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + swap=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) try: - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float32, np.float32, np.float32, - model_version=2, swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=2, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("Inference request for unknown model")) + self.assertTrue(ex.message().startswith("Request for unknown model")) + + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=3, + swap=True, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if ENSEMBLES: + if all(x in BACKENDS for x in ["graphdef", "savedmodel"]): + + def test_ensemble_mix_platform(self): + # Skip on CPU only machine as TensorRT model is used in this ensemble + if CPU_ONLY: + return + for bs in (1, 8): + iu.infer_exact( + self, + "mix_platform", + (bs, 16), + bs, + np.float32, + np.float32, + np.float32, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if "graphdef" in BACKENDS: + + def test_ensemble_mix_type(self): + for bs in (1, 8): + iu.infer_exact( + self, + "mix_type", + (bs, 16), + bs, + np.int32, + np.float32, + np.float32, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if all(x in BACKENDS for x in ["graphdef", "savedmodel"]): + + def test_ensemble_mix_ensemble(self): + for bs in (1, 8): + iu.infer_exact( + self, + "mix_ensemble", + (bs, 16), + bs, + np.int32, + np.float32, + np.float32, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if all( + x in BACKENDS + for x in [ + "graphdef", + ] + ): + + def test_ensemble_mix_batch_nobatch(self): + base_names = ["batch_to_nobatch", "nobatch_to_batch"] + for name in base_names: + for bs in (1, 8): + iu.infer_exact( + self, + name, + (bs, 16), + bs, + np.float32, + np.float32, + np.float32, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + iu.infer_exact( + self, + name + "_nobatch", + (8, 16), + 1, + np.float32, + np.float32, 
+ np.float32, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + # batch -> nobatch -> batch + for bs in (1, 8): + iu.infer_exact( + self, + "mix_nobatch_batch", + (bs, 16), + bs, + np.float32, + np.float32, + np.float32, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY): + + def test_ensemble_label_lookup(self): + if all(x in BACKENDS for x in ["graphdef", "savedmodel"]): + # Ensemble needs to look up label from the actual model + for bs in (1, 8): + iu.infer_exact( + self, + "mix_platform", + (bs, 16), + bs, + np.float32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if all(x in BACKENDS for x in ["graphdef", "savedmodel"]): + # Label from the actual model will be passed along the nested ensemble + for bs in (1, 8): + iu.infer_exact( + self, + "mix_ensemble", + (bs, 16), + bs, + np.int32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if "graphdef" in BACKENDS: + # If label file is provided, it will use the provided label file directly + try: + iu.infer_exact( + self, + "wrong_label", + (1, 16), + 1, + np.int32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + except AssertionError: + # Sanity check that infer_exact failed since this ensemble is provided + # with unexpected labels + pass - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float32, np.float32, np.float32, - model_version=3, swap=True) + if "graphdef" in BACKENDS: + for bs in (1, 8): + iu.infer_exact( + self, + "label_override", + (bs, 16), + bs, + np.int32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + use_http=USE_HTTP, + use_grpc=USE_GRPC, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh new file mode 100755 index 0000000000..4c136cf1dd --- /dev/null +++ b/qa/L0_infer/install_and_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Determine the operating system to call the correct package manager. +ID_LIKE=$(grep -Po '(?<=ID_LIKE=).*' /etc/os-release | awk -F= '{print $1}' | tr -d '"' | awk '{print $1}') + +# Note: This script is to be used with customized triton containers that need +# dependencies to run L0_infer tests +if [[ "$ID_LIKE" =~ "debian" ]]; then + apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + jq \ + python3 \ + python3-pip +else + yum install -y \ + jq \ + curl +fi + +pip3 install --upgrade pip +# install client libraries +pip3 install tritonclient[all] + +# Run the actual test +bash -x test.sh diff --git a/qa/L0_infer/test.sh b/qa/L0_infer/test.sh index 50e9870497..36f63053e3 100755 --- a/qa/L0_infer/test.sh +++ b/qa/L0_infer/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,52 +25,418 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -CLIENT_LOG="./client.log" +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +ldconfig || true + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +CLIENT_LOG_BASE="./client" INFER_TEST=infer_test.py +SERVER_TIMEOUT=${SERVER_TIMEOUT:=600} -DATADIR=/data/inferenceserver +if [ -z "$TEST_SYSTEM_SHARED_MEMORY" ]; then + TEST_SYSTEM_SHARED_MEMORY="0" +fi -SERVER=/opt/tensorrtserver/bin/trtserver -SERVER_ARGS=--model-store=$DATADIR/qa_model_repository -SERVER_LOG="./inference_server.log" -source ../common/util.sh +if [ -z "$TEST_CUDA_SHARED_MEMORY" ]; then + TEST_CUDA_SHARED_MEMORY="0" +fi -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 +if [ -z "$TEST_VALGRIND" ]; then + TEST_VALGRIND="0" fi -RET=0 +if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG_BASE="./valgrind_test" + LEAKCHECK=/usr/bin/valgrind + LEAKCHECK_ARGS_BASE="--leak-check=full --show-leak-kinds=definite --max-threads=3000 --num-callers=20" + SERVER_TIMEOUT=4000 + rm -f $LEAKCHECK_LOG_BASE* + # Remove 'python', 'python_dlpack' and 'onnx' from BACKENDS and test them + # separately below. 
+ BACKENDS="graphdef savedmodel libtorch plan openvino" +fi + +if [ "$TEST_SYSTEM_SHARED_MEMORY" -eq 1 ] || [ "$TEST_CUDA_SHARED_MEMORY" -eq 1 ]; then + EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="33"} +else + EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="46"} +fi -set +e +TF_VERSION=${TF_VERSION:=2} +TEST_JETSON=${TEST_JETSON:=0} -# python unittest seems to swallow ImportError and still return 0 exit -# code. So need to explicitly check CLIENT_LOG to make sure we see -# some running tests -rm -f $CLIENT_LOG -python $INFER_TEST >$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Failed\n***" - RET=1 +# Default size (in MB) of shared memory to be used by each python model +# instance (Default is 1MB) +DEFAULT_SHM_SIZE_MB=${DEFAULT_SHM_SIZE_MB:=1} +DEFAULT_SHM_SIZE_BYTES=$((1024*1024*$DEFAULT_SHM_SIZE_MB)) + +# On windows the paths invoked by the script (running in WSL) must use +# /mnt/c when needed but the paths on the tritonserver command-line +# must be C:/ style. +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then + MODELDIR=${MODELDIR:=C:/models} + DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} + BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} + SERVER=${SERVER:=/mnt/c/tritonserver/bin/tritonserver.exe} +else + MODELDIR=${MODELDIR:=`pwd`/models} + DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} + SERVER=${TRITON_DIR}/bin/tritonserver + BACKEND_DIR=${TRITON_DIR}/backends + + # PyTorch on SBSA requires libgomp to be loaded first. See the following + # GitHub issue for more information: + # https://github.com/pytorch/pytorch/issues/2575 + arch=`uname -m` + if [ $arch = "aarch64" ]; then + SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1 + fi fi -grep -c "HTTP/1.1 200 OK" $CLIENT_LOG -if [ $? -ne 0 ]; then - cat $CLIENT_LOG +# Allow more time to exit. 
Ensemble brings in too many models +SERVER_ARGS_EXTRA="--exit-timeout-secs=${SERVER_TIMEOUT} --backend-directory=${BACKEND_DIR} --backend-config=tensorflow,version=${TF_VERSION} --backend-config=python,stub-timeout-seconds=120 --backend-config=python,shm-default-byte-size=${DEFAULT_SHM_SIZE_BYTES}" +SERVER_ARGS="--model-repository=${MODELDIR} ${SERVER_ARGS_EXTRA}" +SERVER_LOG_BASE="./inference_server" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* + +RET=0 + +# Verify the flag is set only on CPU-only device +if [ "$TRITON_SERVER_CPU_ONLY" == "1" ]; then + gpu_count=`nvidia-smi -L | grep GPU | wc -l` + if [ "$gpu_count" -ne 0 ]; then + echo -e "\n***\n*** Running on a device with GPU\n***" echo -e "\n***\n*** Test Failed To Run\n***" - RET=1 + exit 1 + fi fi -set -e +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch plan python python_dlpack openvino"} +export BACKENDS + +# If ENSEMBLES not specified, set to 1 +ENSEMBLES=${ENSEMBLES:="1"} +export ENSEMBLES + +# Test for both batch and nobatch models +NOBATCH=${NOBATCH:="1"} +export NOBATCH +BATCH=${BATCH:="1"} +export BATCH + +if [[ $BACKENDS == *"python_dlpack"* ]]; then + if [[ "aarch64" != $(uname -m) ]] ; then + pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + else + pip3 install torch==1.13.0 -f https://download.pytorch.org/whl/torch_stable.html + fi +fi + +function generate_model_repository() { + rm -fr models && mkdir models + for BACKEND in $BACKENDS; do + if [ "$BACKEND" == "python" ] || [ "$BACKEND" == "python_dlpack" ]; then + # We will be using ONNX models config.pbtxt and tweak them to make them + # appropriate for Python backend + onnx_models=`find ${DATADIR}/qa_model_repository/ -maxdepth 1 -type d -regex '.*onnx_.*'` + + # Types that need to use SubAdd instead of AddSub + swap_types="float32 int32 int16 int8" + for onnx_model in $onnx_models; do + if [ "$BACKEND" == "python_dlpack" ]; then + python_model=`echo $onnx_model | sed 's/onnx/python_dlpack/g' | sed 's,'"$DATADIR/qa_model_repository/"',,g'` + else + python_model=`echo $onnx_model | sed 's/onnx/python/g' | sed 's,'"$DATADIR/qa_model_repository/"',,g'` + fi + + mkdir -p models/$python_model/1/ + # Remove platform and use Python as the backend + if [ "$BACKEND" == "python" ]; then + cat $onnx_model/config.pbtxt | sed 's/platform:.*//g' | sed 's/version_policy.*/backend:\ "python"/g' | sed 's/onnx/python/g' > models/$python_model/config.pbtxt + else + cat $onnx_model/config.pbtxt | sed 's/platform:.*//g' | sed 's/version_policy.*/backend:\ "python"/g' | sed 's/onnx/python_dlpack/g' > models/$python_model/config.pbtxt + fi + cp $onnx_model/output0_labels.txt models/$python_model + + is_swap_type="0" + + # Check whether this model needs to be swapped + for swap_type in $swap_types; do + model_type="$swap_type"_"$swap_type"_"$swap_type" + if [ "$BACKEND" == "python_dlpack" ]; then + model_name=python_dlpack_$model_type + model_name_nobatch=python_dlpack_nobatch_$model_type + if [ $python_model == $model_name ] || [ $python_model == $model_name_nobatch ]; then + cp ../python_models/dlpack_sub_add/model.py models/$python_model/1/ + is_swap_type="1" + fi + else + model_name=python_$model_type + model_name_nobatch=python_nobatch_$model_type + if [ $python_model == $model_name ] || [ $python_model == $model_name_nobatch ]; then + cp ../python_models/sub_add/model.py models/$python_model/1/ + is_swap_type="1" + fi + fi + done -kill $SERVER_PID -wait $SERVER_PID + # Use 
the AddSub model if it doesn't need to be swapped + if [ $is_swap_type == "0" ]; then + if [ "$BACKEND" == "python_dlpack" ]; then + cp ../python_models/dlpack_add_sub/model.py models/$python_model/1/ + else + cp ../python_models/add_sub/model.py models/$python_model/1/ + fi + fi + done + elif [ "$BACKEND" == "plan" ] && [ "$TRITON_SERVER_CPU_ONLY" == "1" ]; then + # skip plan_tensorrt models since they don't run on CPU only containers + continue + else + cp -r ${DATADIR}/qa_model_repository/${BACKEND}* \ + models/. + fi + done + + if [ "$ENSEMBLES" == "1" ]; then + + # Copy identity backend models and ensembles + for BACKEND in $BACKENDS; do + if [ "$BACKEND" == "plan" ] && [ "$TRITON_SERVER_CPU_ONLY" == "1" ]; then + # skip plan_tensorrt models since they don't run on CPU only containers + continue + elif [ "$BACKEND" != "python" ] && [ "$BACKEND" != "python_dlpack" ] && [ "$BACKEND" != "openvino" ]; then + cp -r ${DATADIR}/qa_ensemble_model_repository/qa_model_repository/*${BACKEND}* \ + models/. + fi + done + + cp -r ${DATADIR}/qa_ensemble_model_repository/qa_model_repository/nop_* \ + models/. + + create_nop_version_dir `pwd`/models + + if [[ $BACKENDS == *"graphdef"* ]]; then + ENSEMBLE_MODELS="wrong_label_int32_float32_float32 label_override_int32_float32_float32 mix_type_int32_float32_float32" + + ENSEMBLE_MODELS="${ENSEMBLE_MODELS} batch_to_nobatch_float32_float32_float32 batch_to_nobatch_nobatch_float32_float32_float32 nobatch_to_batch_float32_float32_float32 nobatch_to_batch_nobatch_float32_float32_float32 mix_nobatch_batch_float32_float32_float32" + + if [[ $BACKENDS == *"savedmodel"* ]] ; then + ENSEMBLE_MODELS="${ENSEMBLE_MODELS} mix_platform_float32_float32_float32 mix_ensemble_int32_float32_float32" + fi + + for EM in $ENSEMBLE_MODELS; do + mkdir -p ../ensemble_models/$EM/1 && cp -r ../ensemble_models/$EM models/. + done + fi + fi + + KIND="KIND_GPU" && [[ "$TARGET" == "cpu" ]] && KIND="KIND_CPU" + for FW in $BACKENDS; do + if [ "$FW" == "onnx" ] && [ "$TEST_VALGRIND" -eq 1 ]; then + # Reduce the instance count to make loading onnx models faster + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: ${KIND} count: 1 }]" >> $MC + done + elif [ "$FW" != "plan" ] && [ "$FW" != "python" ] && [ "$FW" != "python_dlpack" ] && [ "$FW" != "openvino" ];then + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: ${KIND} }]" >> $MC + done + elif [ "$FW" == "python" ] || [ "$FW" == "python_dlpack" ] || [ "$FW" == "openvino" ]; then + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: KIND_CPU }]" >> $MC + done + fi + done + + # Modify custom_zero_1_float32 and custom_nobatch_zero_1_float32 for relevant ensembles + # This is done after the instance group change above so that identity backend models + # are run on CPU. Skip for Windows test. + cp -r ../custom_models/custom_zero_1_float32 models/. 
&&\ + mkdir -p models/custom_zero_1_float32/1 && \ + (cd models/custom_zero_1_float32 && \ + echo "instance_group [ { kind: KIND_CPU }]" >> config.pbtxt) + cp -r models/custom_zero_1_float32 models/custom_nobatch_zero_1_float32 && \ + (cd models/custom_zero_1_float32 && \ + sed -i "s/max_batch_size: 1/max_batch_size: 8/" config.pbtxt && \ + sed -i "s/dims: \[ 1 \]/dims: \[ -1 \]/" config.pbtxt) && \ + (cd models/custom_nobatch_zero_1_float32 && \ + sed -i "s/custom_zero_1_float32/custom_nobatch_zero_1_float32/" config.pbtxt && \ + sed -i "s/max_batch_size: 1/max_batch_size: 0/" config.pbtxt && \ + sed -i "s/dims: \[ 1 \]/dims: \[ -1, -1 \]/" config.pbtxt) + +} + +for TARGET in cpu gpu; do + if [ "$TRITON_SERVER_CPU_ONLY" == "1" ]; then + if [ "$TARGET" == "gpu" ]; then + echo -e "Skip GPU testing on CPU-only device" + continue + fi + fi + + SERVER_LOG=$SERVER_LOG_BASE.${TARGET}.log + CLIENT_LOG=$CLIENT_LOG_BASE.${TARGET}.log + + generate_model_repository + + # Check if running a memory leak check + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG=$LEAKCHECK_LOG_BASE.${TARGET}.log + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python3 $INFER_TEST >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e +done + +# Run 'python', 'python_dlpack' and 'onnx' models separately in valgrind test. +# Loading python and python_dlpack models has OOM issue when running with +# valgrind, so loading only batch or nobatch models for each time. +# Loading all the onnx models at once requires more than 12 hours. Loading them +# separately to reduce the loading time. +if [ "$TEST_VALGRIND" -eq 1 ]; then + TESTING_BACKENDS="python python_dlpack onnx" + EXPECTED_NUM_TESTS=42 + if [[ "aarch64" != $(uname -m) ]] ; then + pip3 install torch==1.13.0+cpu -f https://download.pytorch.org/whl/torch_stable.html + else + pip3 install torch==1.13.0 -f https://download.pytorch.org/whl/torch_stable.html + fi + + for BACKENDS in $TESTING_BACKENDS; do + export BACKENDS + for TARGET in cpu gpu; do + rm -fr *models + generate_model_repository + mkdir nobatch_models + mv ./models/*nobatch_* ./nobatch_models/. + cp -fr ./models/nop_* ./nobatch_models/. 
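      # The batch and nobatch repositories are served in separate passes; as
      # noted above, loading every model at once under valgrind runs into
      # out-of-memory and excessive load-time problems.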
+ + for BATCHING_MODE in batch nobatch; do + if [ "$TRITON_SERVER_CPU_ONLY" == "1" ]; then + if [ "$TARGET" == "gpu" ]; then + echo -e "Skip GPU testing on CPU-only device" + continue + fi + fi + + SERVER_LOG=$SERVER_LOG_BASE.${TARGET}.${BACKENDS}.${BATCHING_MODE}.log + CLIENT_LOG=$CLIENT_LOG_BASE.${TARGET}.${BACKENDS}.${BATCHING_MODE}.log + + if [ "$BATCHING_MODE" == "batch" ]; then + NOBATCH="0" + export NOBATCH + BATCH="1" + export BATCH + MODELDIR=`pwd`/models + else + NOBATCH="1" + export NOBATCH + BATCH="0" + export BATCH + MODELDIR=`pwd`/nobatch_models + fi + + SERVER_ARGS="--model-repository=${MODELDIR} ${SERVER_ARGS_EXTRA}" + LEAKCHECK_LOG=$LEAKCHECK_LOG_BASE.${TARGET}.${BACKENDS}.${BATCHING_MODE}.log + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python3 $INFER_TEST >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + kill_server + + set +e + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + set -e + done + done + done +fi if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" fi exit $RET diff --git a/qa/L0_infer_reshape/infer_reshape_test.py b/qa/L0_infer_reshape/infer_reshape_test.py new file mode 100755 index 0000000000..e77dcbecaf --- /dev/null +++ b/qa/L0_infer_reshape/infer_reshape_test.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
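# infer_reshape_test.py drives the identity ("zero") models from the reshape
# model repository: each test feeds one or more input shapes, optionally with
# different expected output shapes, and checks the results across the
# framework backends for both the batching and non-batching model variants.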
+ +import sys + +sys.path.append("../common") + +import os +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + +np_dtype_string = np.dtype(object) + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) + + +class InferReshapeTest(tu.TestResultCollector): + def _full_reshape(self, dtype, input_shapes, output_shapes=None, no_batch=True): + # 'shapes' is list of shapes, one for each input. + if output_shapes is None: + output_shapes = input_shapes + + # For validation assume any shape can be used... + if tu.validate_for_tf_model( + dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0] + ): + # model that supports batching + for bs in (1, 8): + full_shapes = [ + [ + bs, + ] + + input_shape + for input_shape in input_shapes + ] + full_output_shapes = [ + [ + bs, + ] + + output_shape + for output_shape in output_shapes + ] + iu.infer_zero( + self, + "graphdef", + bs, + dtype, + full_shapes, + full_output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + iu.infer_zero( + self, + "savedmodel", + bs, + dtype, + full_shapes, + full_output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + # model that does not support batching + if no_batch: + iu.infer_zero( + self, + "graphdef_nobatch", + 1, + dtype, + input_shapes, + output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + iu.infer_zero( + self, + "savedmodel_nobatch", + 1, + dtype, + input_shapes, + output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if tu.validate_for_onnx_model( + dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0] + ): + # model that supports batching + for bs in (1, 8): + full_shapes = [ + [ + bs, + ] + + input_shape + for input_shape in input_shapes + ] + full_output_shapes = [ + [ + bs, + ] + + output_shape + for output_shape in output_shapes + ] + iu.infer_zero( + self, + "onnx", + bs, + dtype, + full_shapes, + full_output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + # model that does not support batching + if no_batch: + iu.infer_zero( + self, + "onnx_nobatch", + 1, + dtype, + input_shapes, + output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + if tu.validate_for_libtorch_model( + dtype, + dtype, + dtype, + input_shapes[0], + input_shapes[0], + input_shapes[0], + reshape=True, + ): + # skip variable size reshape on libtorch for now, + # see "gen_qa_reshape_model.py" for detail + if dtype != np.int32: + # model that does not support batching + # skip for libtorch string I/O + if no_batch and (dtype != np_dtype_string): + iu.infer_zero( + self, + "libtorch_nobatch", + 1, + dtype, + input_shapes, + output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + # model that supports batching + for bs in (1, 8): + full_shapes = [ + [ + bs, + ] + + input_shape + for input_shape in input_shapes + ] + full_output_shapes = [ + [ + bs, + ] + + output_shape + for output_shape in output_shapes + ] + iu.infer_zero( + self, + "libtorch", + bs, + dtype, + full_shapes, + 
full_output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + for name in ["simple_reshape", "sequence_reshape", "fan_reshape"]: + # [TODO] Skip variable size reshape on ensemble for now. + # Need rework on how ensemble for reshape are generated + if dtype == np.int32: + break + if tu.validate_for_ensemble_model( + name, + dtype, + dtype, + dtype, + input_shapes[0], + input_shapes[0], + input_shapes[0], + ): + # model that supports batching + for bs in (1, 8): + full_shapes = [ + [ + bs, + ] + + input_shape + for input_shape in input_shapes + ] + full_output_shapes = [ + [ + bs, + ] + + output_shape + for output_shape in output_shapes + ] + iu.infer_zero( + self, + name, + bs, + dtype, + full_shapes, + full_output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + # model that does not support batching + if no_batch: + iu.infer_zero( + self, + name + "_nobatch", + 1, + dtype, + input_shapes, + output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + def _trt_reshape(self, dtype, input_shapes, output_shapes=None, no_batch=True): + # 'shapes' is list of shapes, one for each input. + if output_shapes is None: + output_shapes = input_shapes + + if tu.validate_for_trt_model( + dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0] + ): + # model that supports batching + for bs in (1, 8): + full_shapes = [ + [ + bs, + ] + + input_shape + for input_shape in input_shapes + ] + full_output_shapes = [ + [ + bs, + ] + + output_shape + for output_shape in output_shapes + ] + iu.infer_zero( + self, + "plan", + bs, + dtype, + full_shapes, + full_output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + # model that does not support batching + if no_batch: + iu.infer_zero( + self, + "plan_nobatch", + 1, + dtype, + input_shapes, + output_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + def test_ff1(self): + self._full_reshape(np.float32, input_shapes=([1],), no_batch=False) + + def test_ff2(self): + self._full_reshape(np.float32, input_shapes=([1], [8]), no_batch=False) + self._trt_reshape(np.float32, input_shapes=([1], [8])) + + def test_ff3(self): + self._full_reshape(np.float32, input_shapes=([4, 4], [2], [2, 2, 3])) + + def test_ff4(self): + self._full_reshape( + np.float32, + input_shapes=([4, 4], [2], [2, 2, 3], [1]), + output_shapes=([16], [1, 2], [3, 2, 2], [1]), + ) + self._trt_reshape( + np.float32, + input_shapes=([4, 4], [2], [2, 2, 3], [1]), + output_shapes=([2, 2, 4], [1, 2, 1], [3, 2, 2], [1, 1, 1]), + ) + + def test_ii1(self): + self._full_reshape(np.int32, input_shapes=([2, 4, 5, 6],)) + + def test_ii2(self): + self._full_reshape( + np.int32, input_shapes=([4, 1], [2]), output_shapes=([1, 4], [1, 2]) + ) + + def test_ii3(self): + self._full_reshape(np.int32, input_shapes=([1, 4, 1], [8], [2, 2, 3])) + + def test_oo1(self): + self._full_reshape(np.object_, input_shapes=([1],), no_batch=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_infer_reshape/test.sh b/qa/L0_infer_reshape/test.sh new file mode 100755 index 0000000000..218be954d9 --- /dev/null +++ b/qa/L0_infer_reshape/test.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +INFER_TEST=infer_reshape_test.py +EXPECTED_NUM_TESTS="8" +TEST_RESULT_FILE='test_results.txt' +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $SERVER_LOG $CLIENT_LOG +rm -fr models && mkdir models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_reshape_model_repository/* models/. && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_reshape_model_repository/* \ + models/. +for i in \ + nobatch_zero_3_float32 \ + nobatch_zero_4_float32 \ + zero_1_float32 \ + zero_2_float32 \ + zero_3_float32 \ + zero_4_float32 \ + nobatch_zero_1_int32 \ + nobatch_zero_2_int32 \ + nobatch_zero_3_int32 \ + zero_1_int32 \ + zero_2_int32 \ + zero_3_int32 ; do + cp -r models/graphdef_${i} models/custom_${i} + rm -fr models/custom_${i}/1/* + (cd models/custom_${i} && \ + sed -i "s/^platform:.*/backend: \"identity\"/" config.pbtxt && \ + sed -i "s/^name:.*/name: \"custom_${i}\"/" config.pbtxt && \ + echo "instance_group [ { kind: KIND_CPU }]" >> config.pbtxt) +done + +create_nop_version_dir `pwd`/models + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# python unittest seems to swallow ImportError and still return 0 +# exit code. So need to explicitly check CLIENT_LOG to make sure +# we see some running tests +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_infer_variable/infer_variable_test.py b/qa/L0_infer_variable/infer_variable_test.py new file mode 100755 index 0000000000..e5e6470a3c --- /dev/null +++ b/qa/L0_infer_variable/infer_variable_test.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
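+#
+# Exercises exact-result inference (iu.infer_exact) against the variable-shape
+# QA models for the TF (graphdef/savedmodel), TensorRT, ONNX and LibTorch
+# backends and their ensemble variants, in both batching and non-batching forms.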
+ +import sys + +sys.path.append("../common") + +import os +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + +np_dtype_string = np.dtype(object) + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) + + +class InferVariableTest(tu.TestResultCollector): + def _full_exact( + self, + input_dtype, + output0_dtype, + output1_dtype, + input_shape, + output0_shape, + output1_shape, + output0_raw=True, + output1_raw=True, + swap=False, + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): + for bs in (1, batch_size): + # model that does not support batching + if bs == 1: + iu.infer_exact( + tester, + pf + "_nobatch", + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + # model that supports batching. Skip for libtorch string I/O + elif pf == "libtorch" and tu.validate_for_libtorch_model( + input_dtype, + output0_dtype, + output1_dtype, + tensor_shape, + tensor_shape, + tensor_shape, + bs, + ): + iu.infer_exact( + tester, + pf, + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + all_ensemble_prefix = ["simple_", "sequence_", "fan_"] + ensemble_prefix = [""] + for prefix in all_ensemble_prefix: + if tu.validate_for_ensemble_model( + prefix, + input_dtype, + output0_dtype, + output1_dtype, + input_shape, + input_shape, + input_shape, + ): + ensemble_prefix.append(prefix) + + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + input_shape, + output0_shape, + output1_shape, + ): + for prefix in ensemble_prefix: + for pf in ["graphdef", "savedmodel"]: + _infer_exact_helper( + self, + prefix + pf, + input_shape, + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_trt_model( + input_dtype, + output0_dtype, + output1_dtype, + input_shape, + output0_shape, + output1_shape, + ): + for prefix in ensemble_prefix: + if input_dtype == np.int8: + _infer_exact_helper( + self, + prefix + "plan", + input_shape + (1, 1), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + else: + _infer_exact_helper( + self, + prefix + "plan", + input_shape, + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if 
tu.validate_for_onnx_model( + input_dtype, + output0_dtype, + output1_dtype, + input_shape, + output0_shape, + output1_shape, + ): + # No basic ensemble models are created against custom models [TODO] + _infer_exact_helper( + self, + "onnx", + input_shape, + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_libtorch_model( + input_dtype, + output0_dtype, + output1_dtype, + input_shape, + output0_shape, + output1_shape, + ): + # No basic ensemble models are created against custom models [TODO] + _infer_exact_helper( + self, + "libtorch", + input_shape, + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + def test_raw_fff(self): + self._full_exact(np.float32, np.float32, np.float32, (16,), (16,), (16,)) + + def test_raw_fii(self): + self._full_exact(np.float32, np.int32, np.int32, (2, 8), (2, 8), (2, 8)) + + def test_raw_fll(self): + self._full_exact(np.float32, np.int64, np.int64, (8, 4), (8, 4), (8, 4)) + + def test_raw_fil(self): + self._full_exact( + np.float32, np.int32, np.int64, (2, 8, 2), (2, 8, 2), (2, 8, 2) + ) + + def test_raw_ffi(self): + self._full_exact(np.float32, np.float32, np.int32, (16,), (16,), (16,)) + + def test_raw_iii(self): + self._full_exact(np.int32, np.int32, np.int32, (2, 8), (2, 8), (2, 8)) + + def test_faw_iif(self): + self._full_exact( + np.int32, np.int32, np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2) + ) + + def test_raw_ooo(self): + self._full_exact( + np_dtype_string, np_dtype_string, np_dtype_string, (16,), (16,), (16,) + ) + + def test_raw_oii(self): + self._full_exact(np_dtype_string, np.int32, np.int32, (2, 8), (2, 8), (2, 8)) + + def test_raw_ooi(self): + self._full_exact( + np_dtype_string, np_dtype_string, np.int32, (8, 4), (8, 4), (8, 4) + ) + + def test_raw_oio(self): + self._full_exact( + np_dtype_string, np.int32, np_dtype_string, (2, 8, 2), (2, 8, 2), (2, 8, 2) + ) + + def test_class_fff(self): + self._full_exact( + np.float32, + np.float32, + np.float32, + (16,), + (16,), + (16,), + output0_raw=False, + output1_raw=False, + ) + + def test_class_fii(self): + self._full_exact( + np.float32, + np.int32, + np.int32, + (2, 8), + (2, 8), + (2, 8), + output0_raw=False, + output1_raw=False, + ) + + def test_class_fll(self): + self._full_exact( + np.float32, + np.int64, + np.int64, + (8, 4), + (8, 4), + (8, 4), + output0_raw=False, + output1_raw=False, + ) + + def test_class_fil(self): + self._full_exact( + np.float32, + np.int32, + np.int64, + (2, 8, 2), + (2, 8, 2), + (2, 8, 2), + output0_raw=False, + output1_raw=False, + ) + + def test_class_ffi(self): + self._full_exact( + np.float32, + np.float32, + np.int32, + (16,), + (16,), + (16,), + output0_raw=False, + output1_raw=False, + ) + + def test_class_iii(self): + self._full_exact( + np.int32, + np.int32, + np.int32, + (2, 8), + (2, 8), + (2, 8), + output0_raw=False, + output1_raw=False, + ) + + def test_class_iif(self): + self._full_exact( + np.int32, + np.int32, + np.float32, + (2, 8, 2), + (2, 8, 2), + (2, 8, 2), + output0_raw=False, + output1_raw=False, + ) + + def test_mix_ffi(self): + self._full_exact( + np.float32, + np.float32, + np.int32, + (16,), + (16,), + (16,), + output0_raw=True, + output1_raw=False, + ) + + def test_mix_iii(self): + self._full_exact( + np.int32, + np.int32, + np.int32, + (2, 8), + (2, 8), + (2, 8), + output0_raw=False, + output1_raw=True, + ) + + def test_mix_iif(self): + self._full_exact( + np.int32, 
+ np.int32, + np.float32, + (2, 8, 2), + (2, 8, 2), + (2, 8, 2), + output0_raw=True, + output1_raw=False, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_infer_variable/test.sh b/qa/L0_infer_variable/test.sh new file mode 100755 index 0000000000..9760583b94 --- /dev/null +++ b/qa/L0_infer_variable/test.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG_BASE="./client" +INFER_TEST=infer_variable_test.py +EXPECTED_NUM_TESTS="21" +TEST_RESULT_FILE='test_results.txt' + +DATADIR=`pwd`/models + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR --exit-timeout-secs=120" +SERVER_LOG_BASE="./inference_server" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* + +RET=0 + +for TARGET in cpu gpu; do + SERVER_LOG=$SERVER_LOG_BASE.${TARGET}.log + CLIENT_LOG=$CLIENT_LOG_BASE.${TARGET}.log + + rm -fr models && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository models && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_variable_model_repository/* models/. + + create_nop_version_dir `pwd`/models + + KIND="KIND_GPU" && [[ "$TARGET" == "cpu" ]] && KIND="KIND_CPU" + # Onnx models are handled separately, see below + for FW in graphdef savedmodel onnx libtorch; do + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: ${KIND} }]" >> $MC + done + done + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $INFER_TEST >$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_infer_zero/infer_zero_test.py b/qa/L0_infer_zero/infer_zero_test.py new file mode 100755 index 0000000000..3786c5b4a1 --- /dev/null +++ b/qa/L0_infer_zero/infer_zero_test.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + +np_dtype_string = np.dtype(object) + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) +BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx libtorch") +VALIDATION_FNS = { + "onnx": tu.validate_for_onnx_model, + "graphdef": tu.validate_for_tf_model, + "savedmodel": tu.validate_for_tf_model, + "libtorch": tu.validate_for_libtorch_model, +} + + +class InferZeroTest(tu.TestResultCollector): + def _full_zero(self, dtype, shapes): + # 'shapes' is list of shapes, one for each input. 
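+        # The shapes may include zero-sized dimensions. Each backend in
+        # BACKENDS that validates for the dtype is run with batch sizes 1 and 8
+        # and with its "_nobatch" variant; the same shapes are then run through
+        # the simple/sequence/fan "_zero" ensembles when they validate.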
+ for backend in BACKENDS.split(" "): + # object models do not exist right now for PyTorch + if backend == "libtorch" and dtype == "object": + return + + if not VALIDATION_FNS[backend]( + dtype, dtype, dtype, shapes[0], shapes[0], shapes[0] + ): + return + + for bs in (1, 8): + batch_shapes = [ + [ + bs, + ] + + shape + for shape in shapes + ] + iu.infer_zero( + self, + backend, + bs, + dtype, + batch_shapes, + batch_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + # model that does not support batching + iu.infer_zero( + self, + f"{backend}_nobatch", + 1, + dtype, + shapes, + shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + for name in ["simple_zero", "sequence_zero", "fan_zero"]: + if tu.validate_for_ensemble_model( + name, dtype, dtype, dtype, shapes[0], shapes[0], shapes[0] + ): + # model that supports batching + for bs in (1, 8): + batch_shapes = [ + [ + bs, + ] + + shape + for shape in shapes + ] + iu.infer_zero( + self, + name, + bs, + dtype, + batch_shapes, + batch_shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + # model that does not support batching + iu.infer_zero( + self, + name + "_nobatch", + 1, + dtype, + shapes, + shapes, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) + + def test_ff1_sanity(self): + self._full_zero( + np.float32, + ( + [ + 1, + ], + ), + ) + + def test_ff1(self): + self._full_zero( + np.float32, + ( + [ + 0, + ], + ), + ) + + def test_ff3_sanity(self): + self._full_zero( + np.float32, + ( + [ + 1, + ], + [ + 2, + ], + [ + 1, + ], + ), + ) + + def test_ff3_0(self): + self._full_zero( + np.float32, + ( + [ + 0, + ], + [ + 0, + ], + [ + 0, + ], + ), + ) + + def test_ff3_1(self): + self._full_zero( + np.float32, + ( + [ + 0, + ], + [ + 0, + ], + [ + 1, + ], + ), + ) + + def test_ff3_2(self): + self._full_zero( + np.float32, + ( + [ + 0, + ], + [ + 1, + ], + [ + 0, + ], + ), + ) + + def test_ff3_3(self): + self._full_zero( + np.float32, + ( + [ + 1, + ], + [ + 0, + ], + [ + 0, + ], + ), + ) + + def test_ff3_4(self): + self._full_zero( + np.float32, + ( + [ + 1, + ], + [ + 0, + ], + [ + 1, + ], + ), + ) + + def test_hh1_sanity(self): + self._full_zero(np.float16, ([2, 2],)) + + def test_hh1_0(self): + self._full_zero(np.float16, ([1, 0],)) + + def test_hh1_1(self): + self._full_zero(np.float16, ([0, 1],)) + + def test_hh1_2(self): + self._full_zero(np.float16, ([0, 0],)) + + def test_hh3_sanity(self): + self._full_zero(np.float16, ([2, 2], [2, 2], [1, 1])) + + def test_hh3_0(self): + self._full_zero(np.float16, ([0, 0], [0, 0], [0, 0])) + + def test_hh3_1(self): + self._full_zero(np.float16, ([0, 1], [0, 1], [2, 3])) + + def test_hh3_2(self): + self._full_zero(np.float16, ([1, 0], [1, 3], [0, 1])) + + def test_hh3_3(self): + self._full_zero(np.float16, ([1, 1], [3, 0], [0, 0])) + + def test_hh3_4(self): + self._full_zero(np.float16, ([1, 1], [0, 6], [2, 2])) + + def test_oo1_sanity(self): + self._full_zero( + np_dtype_string, + ( + [ + 2, + ], + ), + ) + + def test_oo1(self): + self._full_zero( + np_dtype_string, + ( + [ + 0, + ], + ), + ) + + def test_oo3_sanity(self): + self._full_zero(np_dtype_string, ([2, 2], [2, 2], [1, 1])) + + def test_oo3_0(self): + self._full_zero(np_dtype_string, ([0, 0], [0, 0], [0, 0])) + + def test_oo3_1(self): + self._full_zero(np_dtype_string, ([0, 1], [0, 1], [2, 
3])) + + def test_oo3_2(self): + self._full_zero(np_dtype_string, ([1, 0], [1, 3], [0, 1])) + + def test_oo3_3(self): + self._full_zero(np_dtype_string, ([1, 1], [3, 0], [0, 0])) + + def test_oo3_4(self): + self._full_zero(np_dtype_string, ([1, 1], [0, 6], [2, 2])) + + def test_bb1_sanity(self): + self._full_zero( + bool, + ( + [ + 10, + ], + ), + ) + + def test_bb1_0(self): + self._full_zero( + bool, + ( + [ + 0, + ], + ), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_infer_zero/test.sh b/qa/L0_infer_zero/test.sh new file mode 100755 index 0000000000..02676b2f85 --- /dev/null +++ b/qa/L0_infer_zero/test.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +CLIENT_LOG="./client.log" +INFER_TEST=infer_zero_test.py +EXPECTED_NUM_TESTS="28" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $SERVER_LOG $CLIENT_LOG +rm -fr models && mkdir models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/* models/. && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_identity_model_repository/* models/. + +# Remove version-compatible TensorRT models, as they require version-compatibility +# mode to be turned on when starting the server. 
+rm -rf models/plan_compatible* + +create_nop_version_dir `pwd`/models + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# python unittest seems to swallow ImportError and still return 0 +# exit code. So need to explicitly check CLIENT_LOG to make sure +# we see some running tests +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_inferentia_perf_analyzer/test.sh b/qa/L0_inferentia_perf_analyzer/test.sh new file mode 100755 index 0000000000..1881e07f87 --- /dev/null +++ b/qa/L0_inferentia_perf_analyzer/test.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# First need to set up environment +if [ ${USE_TENSORFLOW} == "1" ] && [ ${USE_PYTORCH} == "1" ] ; then + echo " Unsupported test configuration. Only one of USE_TENSORFLOW and USE_PYTORCH can be set to 1." + exit 0 +elif [ ${USE_TENSORFLOW} == "1" ] ; then + echo "Setting up environment with tensorflow 1" + source ${TRITON_PATH}/python_backend/inferentia/scripts/setup.sh -t --tensorflow-version 1 +elif [ ${USE_PYTORCH} == "1" ] ; then + echo "Setting up environment with pytorch" + source ${TRITON_PATH}/python_backend/inferentia/scripts/setup.sh -p +else + echo " Unsupported test configuration. USE_TENSORFLOW flag is: ${USE_TENSORFLOW} and USE_PYTORCH flag is: ${USE_PYTORCH}. Only one of them can be set to 1." 
+ exit 0 +fi +echo "done setting up environment" + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi + +CLIENT_LOG="./perf_analyzer.log" +PERF_ANALYZER=/opt/tritonserver/qa/clients/perf_analyzer + +OUTPUT_NO_BATCH_JSONDATAFILE=${TEST_JSON_REPO}/validation_no_batch.json +OUTPUT_BATCHED_JSONDATAFILE=${TEST_JSON_REPO}/validation_batched.json +NON_ALIGNED_OUTPUT_NO_BATCH_JSONDATAFILE=${TEST_JSON_REPO}/non_aligned_validation_no_batch.json +NON_ALIGNED_OUTPUT_BATCHED_JSONDATAFILE=${TEST_JSON_REPO}/non_aligned_validation_batched.json +WRONG_OUTPUT_NO_BATCH_JSONDATAFILE=${TEST_JSON_REPO}/wrong_validation_no_batch.json +WRONG_OUTPUT_BATCHED_JSONDATAFILE=${TEST_JSON_REPO}/wrong_validation_batched.json + +ERROR_STRING="error | Request count: 0 | : 0 infer/sec" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./inference_server.log" +source /opt/tritonserver/qa/common/util.sh +TEST_TYPES="single multiple" +BATCHED_FLAGS="_ _batched_" +DISABLE_DEFAULT_BATCHING_FLAGS="_default_batch _no_batch" +# Helper function for clearing out existing model directories +function clear_model_dir () { + for DISABLE_DEFAULT_BATCHING_FLAG in ${DISABLE_DEFAULT_BATCHING_FLAGS}; do + for BATCHED_FLAG in ${BATCHED_FLAGS}; do + for TEST_TYPE in ${TEST_TYPES}; do + DATADIR="${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}" + rm -rf DATADIR + done + done + done +} +# Helper function for generating models +function create_inferentia_models () { + for DISABLE_DEFAULT_BATCHING_FLAG in ${DISABLE_DEFAULT_BATCHING_FLAGS}; do + for BATCHED_FLAG in ${BATCHED_FLAGS}; do + for TEST_TYPE in ${TEST_TYPES}; do + CURR_GEN_SCRIPT="${GEN_SCRIPT} --model_type ${MODEL_TYPE} + --triton_model_dir ${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}/add-sub-1x4 + --compiled_model ${COMPILED_MODEL}" + if [ ${DISABLE_DEFAULT_BATCHING_FLAG} == "_no_batch" ]; then + CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} + --disable_batch_requests_to_neuron" + fi + if [ ${BATCHED_FLAG} == "_batched_" ]; then + CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} + --triton_input INPUT__0,INT64,4 INPUT__1,INT64,4 + --triton_output OUTPUT__0,INT64,4 OUTPUT__1,INT64,4 + --enable_dynamic_batching + --max_batch_size 1000 + --preferred_batch_size 8 + --max_queue_delay_microseconds 100" + else + CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} + --triton_input INPUT__0,INT64,-1x4 INPUT__1,INT64,-1x4 + --triton_output OUTPUT__0,INT64,-1x4 OUTPUT__1,INT64,-1x4" + fi + if [ ${TEST_TYPE} == "single" ]; then + CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} + --neuron_core_range 0:0" + elif [ ${TEST_TYPE} == "multiple" ]; then + CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} + --triton_model_instance_count 3 + --neuron_core_range 0:7" + fi + echo ${CURR_GEN_SCRIPT} + eval ${CURR_GEN_SCRIPT} + done + done + done +} + +# Setup models +if [ ${USE_TENSORFLOW} == "1" ]; then + TEST_FRAMEWORK="tf1" + clear_model_dir + python ${TEST_JSON_REPO}/simple_model.py \ + --name add_sub_model_tf1 \ + --model_type tensorflow \ + --tf_version 1 \ + --batch_size 1 + GEN_SCRIPT="python ${TRITON_PATH}/python_backend/inferentia/scripts/gen_triton_model.py" + MODEL_TYPE="tensorflow" + COMPILED_MODEL="${PWD}/add_sub_model_tf1" + create_inferentia_models + +elif [ ${USE_PYTORCH} == "1" ]; then + TEST_FRAMEWORK="pyt" + clear_model_dir + python ${TEST_JSON_REPO}/simple_model.py \ + --name add_sub_model_pyt \ + --model_type pytorch \ + --batch_size 1 + GEN_SCRIPT="python 
${TRITON_PATH}/python_backend/inferentia/scripts/gen_triton_model.py" + MODEL_TYPE="pytorch" + COMPILED_MODEL="$PWD/add_sub_model_pyt.pt" + create_inferentia_models +fi + + +RET=0 +for DISABLE_DEFAULT_BATCHING_FLAG in ${DISABLE_DEFAULT_BATCHING_FLAGS}; do + for BATCHED_FLAG in ${BATCHED_FLAGS}; do + for TEST_TYPE in $TEST_TYPES; do + DATADIR="${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}" + SERVER_ARGS="--model-repository=${DATADIR} --log-verbose=1" + PERF_ANALYZER_EXTRA_ARGS="" + if [ ${BATCHED_FLAG} == "_batched_" ]; then + PERF_ANALYZER_EXTRA_ARGS="-b 6" + NON_ALIGNED_OUTPUT_JSONDATAFILE=${NON_ALIGNED_OUTPUT_BATCHED_JSONDATAFILE} + WRONG_OUTPUT_JSONDATAFILE=${WRONG_OUTPUT_BATCHED_JSONDATAFILE} + OUTPUT_JSONDATAFILE=${OUTPUT_BATCHED_JSONDATAFILE} + else + PERF_ANALYZER_EXTRA_ARGS="" + NON_ALIGNED_OUTPUT_JSONDATAFILE=${NON_ALIGNED_OUTPUT_NO_BATCH_JSONDATAFILE} + WRONG_OUTPUT_JSONDATAFILE=${WRONG_OUTPUT_NO_BATCH_JSONDATAFILE} + OUTPUT_JSONDATAFILE=${OUTPUT_NO_BATCH_JSONDATAFILE} + fi + rm -f $SERVER_LOG $CLIENT_LOG + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + set +e + $PERF_ANALYZER -v -m add-sub-1x4 --concurrency-range 1:10:4 --input-data=${NON_ALIGNED_OUTPUT_JSONDATAFILE} ${PERF_ANALYZER_EXTRA_ARGS} >$CLIENT_LOG 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "The 'validation_data' field doesn't align with 'data' field in the json file" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -m add-sub-1x4 --concurrency-range 1:10:4 --input-data=${WRONG_OUTPUT_JSONDATAFILE} ${PERF_ANALYZER_EXTRA_ARGS} >$CLIENT_LOG 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "Output doesn't match expected output" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -m add-sub-1x4 --concurrency-range 1:10:4 --input-data=${OUTPUT_JSONDATAFILE} ${PERF_ANALYZER_EXTRA_ARGS} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + kill_server + done + done +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_input_validation/input_validation_test.py b/qa/L0_input_validation/input_validation_test.py new file mode 100755 index 0000000000..8e7f58bb0c --- /dev/null +++ b/qa/L0_input_validation/input_validation_test.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import tritonclient.grpc as tritongrpcclient +import tritonclient.utils.shared_memory as shm +from tritonclient.utils import InferenceServerException, np_to_triton_dtype + + +class InputValTest(unittest.TestCase): + def test_input_validation_required_empty(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name="input_all_required", + inputs=inputs, + ) + err_str = str(e.exception) + self.assertIn( + "expected 3 inputs but got 0 inputs for model 'input_all_required'. Got input(s) [], but missing required input(s) ['INPUT0','INPUT1','INPUT2']. Please provide all required input(s).", + err_str, + ) + + def test_input_validation_optional_empty(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name="input_optional", + inputs=inputs, + ) + err_str = str(e.exception) + self.assertIn( + "expected number of inputs between 3 and 4 but got 0 inputs for model 'input_optional'. Got input(s) [], but missing required input(s) ['INPUT0','INPUT1','INPUT2']. Please provide all required input(s).", + err_str, + ) + + def test_input_validation_required_missing(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + inputs.append(tritongrpcclient.InferInput("INPUT0", [1], "FP32")) + + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.float32)) + + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name="input_all_required", + inputs=inputs, + ) + err_str = str(e.exception) + self.assertIn( + "expected 3 inputs but got 1 inputs for model 'input_all_required'. Got input(s) ['INPUT0'], but missing required input(s) ['INPUT1','INPUT2']. 
Please provide all required input(s).", + err_str, + ) + + def test_input_validation_optional(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + inputs.append(tritongrpcclient.InferInput("INPUT0", [1], "FP32")) + # Option Input is added, 2 required are missing + + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.float32)) + + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name="input_optional", + inputs=inputs, + ) + err_str = str(e.exception) + self.assertIn( + "expected number of inputs between 3 and 4 but got 1 inputs for model 'input_optional'. Got input(s) ['INPUT0'], but missing required input(s) ['INPUT1','INPUT2']. Please provide all required input(s).", + err_str, + ) + + def test_input_validation_all_optional(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + result = triton_client.infer( + model_name="input_all_optional", + inputs=inputs, + ) + response = result.get_response() + self.assertIn(str(response.outputs[0].name), "OUTPUT0") + + +class InputShapeTest(unittest.TestCase): + def test_input_shape_validation(self): + input_size = 8 + model_name = "pt_identity" + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + + # Pass + input_data = np.arange(input_size)[None].astype(np.float32) + inputs = [ + tritongrpcclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.infer(model_name=model_name, inputs=inputs) + + # Larger input byte size than expected + input_data = np.arange(input_size + 2)[None].astype(np.float32) + inputs = [ + tritongrpcclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + # Compromised input shape + inputs[0].set_shape((1, input_size)) + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name=model_name, + inputs=inputs, + ) + err_str = str(e.exception) + self.assertIn( + "input byte size mismatch for input 'INPUT0' for model 'pt_identity'. 
Expected 32, got 40", + err_str, + ) + + def test_input_string_shape_validation(self): + input_size = 16 + model_name = "graphdef_object_int32_int32" + np_dtype_string = np.dtype(object) + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + + def get_input_array(input_size, np_dtype): + rinput_dtype = iu._range_repr_dtype(np_dtype) + input_array = np.random.randint( + low=0, high=127, size=(1, input_size), dtype=rinput_dtype + ) + + # Convert to string type + inn = np.array( + [str(x) for x in input_array.reshape(input_array.size)], dtype=object + ) + input_array = inn.reshape(input_array.shape) + + inputs = [] + inputs.append( + tritongrpcclient.InferInput( + "INPUT0", input_array.shape, np_to_triton_dtype(np_dtype) + ) + ) + inputs.append( + tritongrpcclient.InferInput( + "INPUT1", input_array.shape, np_to_triton_dtype(np_dtype) + ) + ) + + inputs[0].set_data_from_numpy(input_array) + inputs[1].set_data_from_numpy(input_array) + return inputs + + # Input size is less than expected + inputs = get_input_array(input_size - 2, np_dtype_string) + # Compromised input shape + inputs[0].set_shape((1, input_size)) + inputs[1].set_shape((1, input_size)) + with self.assertRaises(InferenceServerException) as e: + triton_client.infer(model_name=model_name, inputs=inputs) + err_str = str(e.exception) + self.assertIn( + f"expected {input_size} string elements for inference input 'INPUT1' for model '{model_name}', got {input_size-2}", + err_str, + ) + + # Input size is greater than expected + inputs = get_input_array(input_size + 2, np_dtype_string) + # Compromised input shape + inputs[0].set_shape((1, input_size)) + inputs[1].set_shape((1, input_size)) + with self.assertRaises(InferenceServerException) as e: + triton_client.infer(model_name=model_name, inputs=inputs) + err_str = str(e.exception) + self.assertIn( + f"unexpected number of string elements {input_size+1} for inference input 'INPUT1' for model '{model_name}', expecting {input_size}", + err_str, + ) + + def test_wrong_input_shape_tensor_size(self): + def inference_helper(model_name, batch_size=1): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + if batch_size > 1: + dummy_input_data = np.random.rand(batch_size, 32, 32).astype(np.float32) + else: + dummy_input_data = np.random.rand(32, 32).astype(np.float32) + shape_tensor_data = np.asarray([4, 4], dtype=np.int32) + + # Pass incorrect input byte size date for shape tensor + # Use shared memory to bypass the shape check in client library + input_byte_size = (shape_tensor_data.size - 1) * np.dtype(np.int32).itemsize + + input_shm_handle = shm.create_shared_memory_region( + "INPUT0_SHM", + "/INPUT0_SHM", + input_byte_size, + ) + shm.set_shared_memory_region( + input_shm_handle, + [ + shape_tensor_data, + ], + ) + triton_client.register_system_shared_memory( + "INPUT0_SHM", + "/INPUT0_SHM", + input_byte_size, + ) + + inputs = [ + tritongrpcclient.InferInput( + "DUMMY_INPUT0", + dummy_input_data.shape, + np_to_triton_dtype(np.float32), + ), + tritongrpcclient.InferInput( + "INPUT0", + shape_tensor_data.shape, + np_to_triton_dtype(np.int32), + ), + ] + inputs[0].set_data_from_numpy(dummy_input_data) + inputs[1].set_shared_memory("INPUT0_SHM", input_byte_size) + + outputs = [ + tritongrpcclient.InferRequestedOutput("DUMMY_OUTPUT0"), + tritongrpcclient.InferRequestedOutput("OUTPUT0"), + ] + + try: + # Perform inference + with self.assertRaises(InferenceServerException) as e: + triton_client.infer( + model_name=model_name, inputs=inputs, 
outputs=outputs + ) + err_str = str(e.exception) + correct_input_byte_size = ( + shape_tensor_data.size * np.dtype(np.int32).itemsize + ) + self.assertIn( + f"input byte size mismatch for input 'INPUT0' for model '{model_name}'. Expected {correct_input_byte_size}, got {input_byte_size}", + err_str, + ) + finally: + shm.destroy_shared_memory_region(input_shm_handle) + triton_client.unregister_system_shared_memory("INPUT0_SHM") + + inference_helper(model_name="plan_nobatch_zero_1_float32_int32") + inference_helper(model_name="plan_zero_1_float32_int32", batch_size=8) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_input_validation/models/input_all_optional/1/model.py b/qa/L0_input_validation/models/input_all_optional/1/model.py new file mode 100644 index 0000000000..40f8b25579 --- /dev/null +++ b/qa/L0_input_validation/models/input_all_optional/1/model.py @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([1], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_input_validation/models/input_all_optional/config.pbtxt b/qa/L0_input_validation/models/input_all_optional/config.pbtxt new file mode 100644 index 0000000000..24e8259070 --- /dev/null +++ b/qa/L0_input_validation/models/input_all_optional/config.pbtxt @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "input_all_optional" +backend: "python" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_input_validation/models/input_all_required/1/model.py b/qa/L0_input_validation/models/input_all_required/1/model.py new file mode 100644 index 0000000000..40f8b25579 --- /dev/null +++ b/qa/L0_input_validation/models/input_all_required/1/model.py @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([1], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_input_validation/models/input_all_required/config.pbtxt b/qa/L0_input_validation/models/input_all_required/config.pbtxt new file mode 100644 index 0000000000..e1268ff210 --- /dev/null +++ b/qa/L0_input_validation/models/input_all_required/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
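+#
+# Python-backend model with three required inputs (no 'optional' flag);
+# input_validation_test.py sends requests that omit some or all of them and
+# expects a "missing required input(s)" error.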
+ +name: "input_all_required" +backend: "python" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_input_validation/models/input_optional/1/model.py b/qa/L0_input_validation/models/input_optional/1/model.py new file mode 100644 index 0000000000..40f8b25579 --- /dev/null +++ b/qa/L0_input_validation/models/input_optional/1/model.py @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([1], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_input_validation/models/input_optional/config.pbtxt b/qa/L0_input_validation/models/input_optional/config.pbtxt new file mode 100644 index 0000000000..2fe7183a16 --- /dev/null +++ b/qa/L0_input_validation/models/input_optional/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "input_optional" +backend: "python" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT3" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_input_validation/test.sh b/qa/L0_input_validation/test.sh new file mode 100755 index 0000000000..22e0560959 --- /dev/null +++ b/qa/L0_input_validation/test.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
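+# Overview of the phases this script runs below: pytest InputValTest against
+# the Python models in ./models, pytest InputShapeTest against a generated
+# pt_identity PyTorch model plus copied QA models, and finally the compiled
+# input_byte_size_test binary against identity models.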
+ +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION} +SERVER=/opt/tritonserver/bin/tritonserver +CLIENT_LOG="./input_validation_client.log" +TEST_PY=./input_validation_test.py +TEST_RESULT_FILE='./test_results.txt' +SERVER_LOG="./inference_server.log" +TEST_LOG="./input_byte_size_test.log" +TEST_EXEC=./input_byte_size_test + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +# input_validation_test +SERVER_ARGS="--model-repository=`pwd`/models" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 -m pytest --junitxml="input_validation.report.xml" $TEST_PY::InputValTest >> $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputValTest FAILED. \n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# input_shape_validation_test +pip install torch +pip install pytest-asyncio + +mkdir -p models/pt_identity/1 +PYTHON_CODE=$(cat < models/pt_identity/config.pbtxt << EOL +name: "pt_identity" +backend: "pytorch" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [8] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [8] + } +] +# ensure we batch requests together +dynamic_batching { + max_queue_delay_microseconds: 1000000 +} +EOL + +cp -r $DATADIR/qa_model_repository/graphdef_object_int32_int32 models/. +cp -r $DATADIR/qa_shapetensor_model_repository/plan_nobatch_zero_1_float32_int32 models/. +cp -r $DATADIR/qa_shapetensor_model_repository/plan_zero_1_float32_int32 models/. + +SERVER_ARGS="--model-repository=`pwd`/models" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 -m pytest --junitxml="input_shape_validation.report.xml" $TEST_PY::InputShapeTest >> $CLIENT_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** input_validation_test.py::InputShapeTest FAILED. \n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# input_byte_size_test +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/{savedmodel_zero_1_float32,savedmodel_zero_1_object} ./models + +set +e +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >> $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** input_byte_size_test FAILED\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Input Validation Test Passed\n***" +else + echo -e "\n***\n*** Input Validation Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh new file mode 100755 index 0000000000..84ab4fb0c0 --- /dev/null +++ b/qa/L0_io/test.sh @@ -0,0 +1,266 @@ +#!/bin/bash +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +IO_TEST_UTIL=./memory_alloc +CLIENT_LOG="./client.log" +MODELSDIR=`pwd`/models + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +ENSEMBLEDIR=/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository + +# Must explicitly set LD_LIBRARY_PATH so that IO_TEST_UTIL can find +# libtritonserver.so. +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH + +rm -f $CLIENT_LOG* + +# PyTorch is required for the Python backend dlpack add sub models +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html +RET=0 + +# Prepare float32 models with basic config +rm -rf $MODELSDIR + +for trial in graphdef savedmodel onnx libtorch plan python python_dlpack; do + full=${trial}_float32_float32_float32 + if [ "$trial" == "python" ]; then + mkdir -p $MODELSDIR/${full}/1 && \ + cp ../python_models/add_sub/model.py $MODELSDIR/${full}/1/. && \ + cp ../python_models/add_sub/config.pbtxt $MODELSDIR/${full}/. && \ + (cd $MODELSDIR/${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + echo "max_batch_size: 64" >> config.pbtxt) + + # ensemble version of the model. + mkdir -p $MODELSDIR/fan_${full}/1 && \ + cp ../python_models/add_sub/model.py $MODELSDIR/fan_${full}/1/. && \ + cp ../python_models/fan_add_sub/config.pbtxt $MODELSDIR/fan_${full}/. && \ + (cd $MODELSDIR/fan_${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + sed -i "s/model_name: \"ENSEMBLE_MODEL_NAME\"/model_name: \"${full}\"/" config.pbtxt && \ + sed -i "0,/name:.*/{s/name:.*/name: \"fan_${full}\"/}" config.pbtxt && \ + echo "max_batch_size: 64" >> config.pbtxt) + continue + fi + + if [ "$trial" == "python_dlpack" ]; then + mkdir -p $MODELSDIR/${full}/1 && \ + cp ../python_models/dlpack_add_sub/model.py $MODELSDIR/${full}/1/. && \ + cp ../python_models/dlpack_add_sub/config.pbtxt $MODELSDIR/${full}/. 
&& \ + (cd $MODELSDIR/${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + sed -i "0,/name:.*/{s/name:.*/name: \"${full}\"/}" config.pbtxt && \ + echo "max_batch_size: 64" >> config.pbtxt) + + # ensemble version of the model. + mkdir -p $MODELSDIR/fan_${full}/1 && \ + cp ../python_models/dlpack_add_sub/model.py $MODELSDIR/fan_${full}/1/. && \ + cp ../python_models/fan_add_sub/config.pbtxt $MODELSDIR/fan_${full}/. && \ + (cd $MODELSDIR/fan_${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + sed -i "s/model_name: \"ENSEMBLE_MODEL_NAME\"/model_name: \"${full}\"/" config.pbtxt && \ + sed -i "0,/name:.*/{s/name:.*/name: \"fan_${full}\"/}" config.pbtxt && \ + echo "max_batch_size: 64" >> config.pbtxt) + continue + fi + + mkdir -p $MODELSDIR/${full}/1 && \ + cp -r $DATADIR/${full}/1/* $MODELSDIR/${full}/1/. && \ + cp $DATADIR/${full}/config.pbtxt $MODELSDIR/${full}/. && \ + (cd $MODELSDIR/${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + echo "instance_group [{ kind: KIND_CPU }]" >> config.pbtxt) + + # ensemble version of the model. + mkdir -p $MODELSDIR/fan_${full}/1 && \ + cp $ENSEMBLEDIR/fan_${full}/config.pbtxt $MODELSDIR/fan_${full}/. && \ + (cd $MODELSDIR/fan_${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt) + + if [ "$trial" == "libtorch" ]; then + (cd $MODELSDIR/fan_${full} && \ + sed -i -e '{ + N + s/key: "OUTPUT\([0-9]\)"\n\(.*\)value: "same_output/key: "OUTPUT__\1"\n\2value: "same_output/ + }' config.pbtxt) + fi +done + +# Prepare string models with basic config +for trial in graphdef savedmodel onnx ; do + full=${trial}_object_object_object + mkdir -p $MODELSDIR/${full}/1 && \ + cp -r $DATADIR/${full}/1/* $MODELSDIR/${full}/1/. && \ + cp $DATADIR/${full}/config.pbtxt $MODELSDIR/${full}/. && \ + (cd $MODELSDIR/${full} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + echo "instance_group [{ kind: KIND_CPU }]" >> config.pbtxt) +done + +# set up "addsub" ensemble for custom float32 model +cp -r $MODELSDIR/fan_graphdef_float32_float32_float32 $MODELSDIR/fan_${full} && \ + (cd $MODELSDIR/fan_${full} && \ + sed -i "s/graphdef_float32_float32_float32/${full}/" config.pbtxt) + +# custom float32 component of ensemble +cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \ + mkdir -p $MODELSDIR/nop_TYPE_FP32_-1/1 + +# prepare libtorch multi-device and multi-gpu models +cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/. +cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py +mkdir -p $MODELSDIR/libtorch_multi_device/1 +mkdir -p $MODELSDIR/libtorch_multi_gpu/1 +cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/. +(cd $MODELSDIR/libtorch_multi_gpu && \ + sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt) + +set +e +python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Error when generating libtorch models. 
\n***" + cat $CLIENT_LOG + exit 1 +fi +set -e + +TRIALS="graphdef savedmodel onnx libtorch plan python python_dlpack libtorch_multi_gpu libtorch_multi_device" +for input_device in -1 0 1; do + for output_device in -1 0 1; do + for trial in ${TRIALS}; do + # TensorRT Plan should only be deployed on GPU device + model_devices="-1 0 1" && [[ "$trial" == "plan" ]] && model_devices="0 1" + full=${trial}_float32_float32_float32 && [[ "$trial" == "libtorch_multi"* ]] && full=${trial} + + for model_device in $model_devices; do + full_log=$CLIENT_LOG.$full.$input_device.$output_device.$model_device + + host_policy=cpu + if [ "$model_device" == "-1" ]; then + if [[ "$trial" != "libtorch_multi"* ]]; then + (cd $MODELSDIR/${full} && \ + sed -i "s/instance_group.*/instance_group [{ kind: KIND_CPU }]/" config.pbtxt) + fi + else + host_policy=gpu_${model_device} + if [[ "$trial" != "libtorch_multi"* ]]; then + (cd $MODELSDIR/${full} && \ + sed -i "s/instance_group.*/instance_group [{ kind: KIND_GPU, gpus: [${model_device}] }]/" config.pbtxt) + fi + fi + + set +e + $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m $full >>$full_log 2>&1 + if [ $? -ne 0 ]; then + cat $full_log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # Test with host policy + set +e + $IO_TEST_UTIL -i $input_device -o $output_device -h $host_policy -r $MODELSDIR -m $full >>$full_log 2>&1 + # FIXME currently only apply the new changes to ORT backend, should apply to others + if [[ "$trial" == "onnx" ]]; then + if [ $? -ne 0 ]; then + cat $full_log + echo -e "\n***\n*** Test Failed. Expect passing \n***" + RET=1 + fi + else + if [ $? -eq 0 ]; then + cat $full_log + echo -e "\n***\n*** Test Failed. Expect failure \n***" + RET=1 + fi + fi + set -e + + # ensemble + if [[ "$trial" != "libtorch_multi"* ]]; then + set +e + $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m fan_$full >>$full_log.ensemble 2>&1 + if [ $? -ne 0 ]; then + cat $full_log.ensemble + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + fi + done + done + + for trial in graphdef savedmodel onnx; do + model_devices="-1 0 1" + for model_device in $model_devices; do + full=${trial}_object_object_object + full_log=$CLIENT_LOG.$full.$input_device.$output_device.$model_device + + if [ "$model_device" == "-1" ]; then + (cd $MODELSDIR/${full} && \ + sed -i "s/instance_group.*/instance_group [{ kind: KIND_CPU }]/" config.pbtxt) + else + (cd $MODELSDIR/${full} && \ + sed -i "s/instance_group.*/instance_group [{ kind: KIND_GPU, gpus: [${model_device}] }]/" config.pbtxt) + fi + + set +e + $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m $full >>$full_log 2>&1 + if [ $? -ne 0 ]; then + cat $full_log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + done + done +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_iterative_sequence/iterative_sequence_e2e.py b/qa/L0_iterative_sequence/iterative_sequence_e2e.py new file mode 100755 index 0000000000..3676a2f6b3 --- /dev/null +++ b/qa/L0_iterative_sequence/iterative_sequence_e2e.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json + +# GRPC streaming helpers.. +import queue +import unittest +from functools import partial + +import numpy as np +import requests +import sseclient +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + +MODEL_CONFIG_BASE = """ +{{ +"backend": "iterative_sequence", +"max_batch_size": 1, +"input" : [ + {{ + "name": "INPUT", + "data_type": "TYPE_INT32", + "dims": [ 1 ] + }} +], +"output" : [ + {{ + "name": "OUTPUT", + "data_type": "TYPE_INT32", + "dims": [ 1 ] + }} +], +"model_transaction_policy" : {{ + "decoupled": true +}}, +{}, +"instance_group" : [{{ "kind": "KIND_CPU" }}] +}} +""" + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class IterativeSequenceTest(tu.TestResultCollector): + def setUp(self): + # Always make sure the original config is used + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model("iterative_sequence") + + def test_generate_stream(self): + headers = {"Accept": "text/event-stream"} + url = "http://localhost:8000/v2/models/iterative_sequence/generate_stream" + inputs = {"INPUT": 2} + res = requests.post(url, data=json.dumps(inputs), headers=headers) + res.raise_for_status() + client = sseclient.SSEClient(res) + res_count = 2 + for event in client.events(): + res_count -= 1 + data = json.loads(event.data) + self.assertIn("OUTPUT", data) + self.assertEqual(res_count, data["OUTPUT"]) + self.assertEqual(0, res_count) + + def test_grpc_stream( + self, sequence_id=0, sequence_start=False, num_requests=1, validation=True + ): + user_data = UserData() + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.start_stream(callback=partial(callback, user_data)) + inputs = [] + inputs.append(grpcclient.InferInput("INPUT", [1, 1], 
"INT32")) + inputs[0].set_data_from_numpy(np.array([[2]], dtype=np.int32)) + + for _ in range(num_requests): + triton_client.async_stream_infer( + model_name="iterative_sequence", + inputs=inputs, + sequence_id=sequence_id, + sequence_start=sequence_start, + ) + res_count = 2 * num_requests + while res_count > 0: + data_item = user_data._completed_requests.get() + res_count -= 1 + if type(data_item) == InferenceServerException: + raise data_item + else: + if validation: + self.assertEqual( + res_count % 2, data_item.as_numpy("OUTPUT")[0][0] + ) + self.assertEqual(0, res_count) + + def test_backlog_fill(self): + config = r'"sequence_batching" : { "iterative_sequence" : true, "max_sequence_idle_microseconds": 8000000, direct: { "max_queue_delay_microseconds" : 10000000 }}' + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "iterative_sequence", config=MODEL_CONFIG_BASE.format(config) + ) + self.test_grpc_stream(num_requests=4, validation=False) + + def test_reschedule_error(self): + # Use short idle timeout (< backend reschedule delay: 0.5s) so that + # the backend won't be able to reschedule the request as the scheduler + # will terminate the sequence early + config = r'"sequence_batching" : { "iterative_sequence" : true, "max_sequence_idle_microseconds" : 200000 }' + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "iterative_sequence", config=MODEL_CONFIG_BASE.format(config) + ) + with self.assertRaises(InferenceServerException) as context: + # Without specifying 'iterative_sequence : true', the sequence + # batcher expects sequence parameters to be provided explicitly + self.test_grpc_stream() + print(str(context.exception)) + self.assertTrue( + "must specify the START flag on the first request of the sequence" + in str(context.exception) + ) + + def test_unsupported_sequence_scheduler(self): + # Override model config with scheduler settings that do not support + # request rescheduling. + configs = [ + r'"sequence_batching" : { "direct" : {}, "iterative_sequence" : false }', + r'"sequence_batching" : { "oldest" : {}, "iterative_sequence" : false }', + ] + sid = 1 + for sc in configs: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "iterative_sequence", config=MODEL_CONFIG_BASE.format(sc) + ) + with self.assertRaises(InferenceServerException) as context: + # Without specifying 'iterative_sequence : true', the sequence + # batcher expects sequence parameters to be provided explicitly + self.test_grpc_stream(sequence_id=sid, sequence_start=True) + sid += 1 + self.assertTrue( + "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE" + in str(context.exception) + ) + + def test_unsupported_dynamic_scheduler(self): + # Override model config with scheduler settings that do not support + # request rescheduling. 
+ configs = [ + r'"dynamic_batching" : {}', + ] + for sc in configs: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: + triton_client.load_model( + "iterative_sequence", config=MODEL_CONFIG_BASE.format(sc) + ) + with self.assertRaises(InferenceServerException) as context: + self.test_grpc_stream() + self.assertTrue( + "Request is released with TRITONSERVER_REQUEST_RELEASE_RESCHEDULE" + in str(context.exception) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_iterative_sequence/models/iterative_sequence/config.pbtxt b/qa/L0_iterative_sequence/models/iterative_sequence/config.pbtxt new file mode 100644 index 0000000000..fbf8685291 --- /dev/null +++ b/qa/L0_iterative_sequence/models/iterative_sequence/config.pbtxt @@ -0,0 +1,48 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +backend: "iterative_sequence" +max_batch_size: 1 +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +model_transaction_policy { + decoupled: True +} +sequence_batching { + iterative_sequence : true +} +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_iterative_sequence/test.sh b/qa/L0_iterative_sequence/test.sh new file mode 100755 index 0000000000..faf1cff084 --- /dev/null +++ b/qa/L0_iterative_sequence/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +CLIENT_LOG="./iterative_sequence_client.log" +TEST_PY=./iterative_sequence_e2e.py +EXPECTED_NUM_TESTS="6" +TEST_RESULT_FILE='test_results.txt' + + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +pip install sseclient-py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=EXPLICIT" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_PY >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_java_memory_growth/MemoryGrowthTest.java b/qa/L0_java_memory_growth/MemoryGrowthTest.java new file mode 100644 index 0000000000..28243459ec --- /dev/null +++ b/qa/L0_java_memory_growth/MemoryGrowthTest.java @@ -0,0 +1,943 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import static org.bytedeco.tritonserver.global.tritonserver.*; + +import com.google.gson.*; +import java.io.*; +import java.util.*; +import java.util.concurrent.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.tritonserver.tritonserver.*; + +public class MemoryGrowthTest { + static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0; + private static boolean done = false; + static float max_growth_allowed = .10f; + static int max_mem_allowed = 30; + + static void FAIL(String MSG) + { + System.err.println("failure: " + MSG); + System.exit(1); + } + + static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) + { + if (err__ != null) { + System.err.println( + "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - " + + TRITONSERVER_ErrorMessage(err__)); + TRITONSERVER_ErrorDelete(err__); + System.exit(1); + } + } + + static boolean enforce_memory_type = false; + static int requested_memory_type; + // Parameters for percentile range to include (exclude outliers) + static final int max_percentile = 90; + static final int min_percentile = 10; + + static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server { + public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) + { + super(p); + deallocator(new DeleteDeallocator(this)); + } + protected static class DeleteDeallocator + extends TRITONSERVER_Server implements Deallocator { + DeleteDeallocator(Pointer p) { super(p); } + @Override public void deallocate() { TRITONSERVER_ServerDelete(this); } + } + } + + static void Usage(String msg) + { + if (msg != null) { + System.err.println(msg); + } + + System.err.println( + "Usage: java " + MemoryGrowthTest.class.getSimpleName() + " [options]"); + System.err.println("\t-i Set number of iterations"); + System.err.println( + "\t-m <\"system\"|\"pinned\"|gpu>" + + " Enforce the memory type for input and output tensors." 
+ + " If not specified, inputs will be in system memory and outputs" + + " will be based on the model's preferred type."); + System.err.println("\t-v Enable verbose logging"); + System.err.println("\t-r [model repository absolute path]"); + System.err.println( + "\t--max-growth Specify maximum allowed memory growth (%)"); + System.err.println("\t--max-memory Specify maximum allowed memory (MB)"); + + System.exit(1); + } + + static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t { + @Override + public TRITONSERVER_Error call( + TRITONSERVER_ResponseAllocator allocator, String tensor_name, + long byte_size, int preferred_memory_type, + long preferred_memory_type_id, Pointer userp, PointerPointer buffer, + PointerPointer buffer_userp, IntPointer actual_memory_type, + LongPointer actual_memory_type_id) + { + // Initially attempt to make the actual memory type and id that we + // allocate be the same as preferred memory type + actual_memory_type.put(0, preferred_memory_type); + actual_memory_type_id.put(0, preferred_memory_type_id); + + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't + // need to do any other book-keeping. + if (byte_size == 0) { + buffer.put(0, null); + buffer_userp.put(0, null); + } else { + Pointer allocated_ptr = new Pointer(); + if (enforce_memory_type) { + actual_memory_type.put(0, requested_memory_type); + } + + actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU); + allocated_ptr = Pointer.malloc(byte_size); + + // Pass the tensor name with buffer_userp so we can show it when + // releasing the buffer. + if (!allocated_ptr.isNull()) { + buffer.put(0, allocated_ptr); + buffer_userp.put(0, Loader.newGlobalRef(tensor_name)); + } + } + + return null; // Success + } + } + + static class ResponseRelease + extends TRITONSERVER_ResponseAllocatorReleaseFn_t { + @Override + public TRITONSERVER_Error call( + TRITONSERVER_ResponseAllocator allocator, Pointer buffer, + Pointer buffer_userp, long byte_size, int memory_type, + long memory_type_id) + { + String name = null; + if (buffer_userp != null) { + name = (String) Loader.accessGlobalRef(buffer_userp); + } else { + name = ""; + } + Pointer.free(buffer); + Loader.deleteGlobalRef(buffer_userp); + + return null; // Success + } + } + + static class InferRequestComplete + extends TRITONSERVER_InferenceRequestReleaseFn_t { + @Override + public void call( + TRITONSERVER_InferenceRequest request, int flags, Pointer userp) + { + // We reuse the request so we don't delete it here. + } + } + + static class InferResponseComplete + extends TRITONSERVER_InferenceResponseCompleteFn_t { + @Override + public void call( + TRITONSERVER_InferenceResponse response, int flags, Pointer userp) + { + if (response != null) { + // Send 'response' to the future. 
+ futures.get(userp).complete(response); + } + } + } + + static ConcurrentHashMap< + Pointer, CompletableFuture> futures = + new ConcurrentHashMap<>(); + static ResponseAlloc responseAlloc = new ResponseAlloc(); + static ResponseRelease responseRelease = new ResponseRelease(); + static InferRequestComplete inferRequestComplete = new InferRequestComplete(); + static InferResponseComplete inferResponseComplete = + new InferResponseComplete(); + + static TRITONSERVER_Error ParseModelMetadata( + JsonObject model_metadata, boolean[] is_int, boolean[] is_torch_model) + { + String seen_data_type = null; + for (JsonElement input_element : + model_metadata.get("inputs").getAsJsonArray()) { + JsonObject input = input_element.getAsJsonObject(); + if (!input.get("datatype").getAsString().equals("INT32") + && !input.get("datatype").getAsString().equals("FP32")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "simple lib example only supports model with data type INT32 or " + + "FP32"); + } + if (seen_data_type == null) { + seen_data_type = input.get("datatype").getAsString(); + } else if (!seen_data_type.equals(input.get("datatype").getAsString())) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "the inputs and outputs of 'simple' model must have the data type"); + } + } + for (JsonElement output_element : + model_metadata.get("outputs").getAsJsonArray()) { + JsonObject output = output_element.getAsJsonObject(); + if (!output.get("datatype").getAsString().equals("INT32") + && !output.get("datatype").getAsString().equals("FP32")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "simple lib example only supports model with data type INT32 or " + + "FP32"); + } else if (!seen_data_type.equals(output.get("datatype").getAsString())) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "the inputs and outputs of 'simple' model must have the data type"); + } + } + + is_int[0] = seen_data_type.equals("INT32"); + is_torch_model[0] = + model_metadata.get("platform").getAsString().equals("pytorch_libtorch"); + return null; + } + + static void GenerateInputData( + IntPointer[] input0_data, IntPointer[] input1_data) + { + input0_data[0] = new IntPointer(16); + input1_data[0] = new IntPointer(16); + for (int i = 0; i < 16; ++i) { + input0_data[0].put(i, i); + input1_data[0].put(i, 1); + } + } + + static void GenerateInputData( + FloatPointer[] input0_data, FloatPointer[] input1_data) + { + input0_data[0] = new FloatPointer(16); + input1_data[0] = new FloatPointer(16); + for (int i = 0; i < 16; ++i) { + input0_data[0].put(i, i); + input1_data[0].put(i, 1); + } + } + + static void CompareResult( + String output0_name, String output1_name, IntPointer input0, + IntPointer input1, IntPointer output0, IntPointer output1) + { + for (int i = 0; i < 16; ++i) { + if ((input0.get(i) + input1.get(i)) != output0.get(i)) { + FAIL("incorrect sum in " + output0_name); + } + if ((input0.get(i) - input1.get(i)) != output1.get(i)) { + FAIL("incorrect difference in " + output1_name); + } + } + } + + static void CompareResult( + String output0_name, String output1_name, FloatPointer input0, + FloatPointer input1, FloatPointer output0, FloatPointer output1) + { + for (int i = 0; i < 16; ++i) { + if ((input0.get(i) + input1.get(i)) != output0.get(i)) { + FAIL("incorrect sum in " + output0_name); + } + if ((input0.get(i) - input1.get(i)) != output1.get(i)) { + FAIL("incorrect difference in " + output1_name); + } + } + } + + static void Check( + 
TRITONSERVER_InferenceResponse response, Pointer input0_data, + Pointer input1_data, String output0, String output1, + long expected_byte_size, int expected_datatype, boolean is_int) + { + HashMap output_data = new HashMap<>(); + + int[] output_count = {0}; + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseOutputCount(response, output_count), + "getting number of response outputs"); + if (output_count[0] != 2) { + FAIL("expecting 2 response outputs, got " + output_count[0]); + } + + for (int idx = 0; idx < output_count[0]; ++idx) { + BytePointer cname = new BytePointer((Pointer) null); + IntPointer datatype = new IntPointer(1); + LongPointer shape = new LongPointer((Pointer) null); + LongPointer dim_count = new LongPointer(1); + Pointer base = new Pointer(); + SizeTPointer byte_size = new SizeTPointer(1); + IntPointer memory_type = new IntPointer(1); + LongPointer memory_type_id = new LongPointer(1); + Pointer userp = new Pointer(); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseOutput( + response, idx, cname, datatype, shape, dim_count, base, byte_size, + memory_type, memory_type_id, userp), + "getting output info"); + + if (cname.isNull()) { + FAIL("unable to get output name"); + } + + String name = cname.getString(); + if ((!name.equals(output0)) && (!name.equals(output1))) { + FAIL("unexpected output '" + name + "'"); + } + + if ((dim_count.get() != 2) || (shape.get(0) != 1) + || (shape.get(1) != 16)) { + FAIL("unexpected shape for '" + name + "'"); + } + + if (datatype.get() != expected_datatype) { + FAIL( + "unexpected datatype '" + + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name + + "'"); + } + + if (byte_size.get() != expected_byte_size) { + FAIL( + "unexpected byte-size, expected " + expected_byte_size + ", got " + + byte_size.get() + " for " + name); + } + + if (enforce_memory_type && (memory_type.get() != requested_memory_type)) { + FAIL( + "unexpected memory type, expected to be allocated in " + + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got " + + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + + memory_type_id.get() + " for " + name); + } + + // We make a copy of the data here... which we could avoid for + // performance reasons but ok for this simple example. + BytePointer odata = new BytePointer(byte_size.get()); + output_data.put(name, odata); + odata.put(base.limit(byte_size.get())); + } + + if (is_int) { + CompareResult( + output0, output1, new IntPointer(input0_data), + new IntPointer(input1_data), new IntPointer(output_data.get(output0)), + new IntPointer(output_data.get(output1))); + } else { + CompareResult( + output0, output1, new FloatPointer(input0_data), + new FloatPointer(input1_data), + new FloatPointer(output_data.get(output0)), + new FloatPointer(output_data.get(output1))); + } + } + + /** + Returns whether the memory growth is within the acceptable range + @param max_float_allowed Maximum allowed memory growth (%) + @param max_mem_allowed Maximum allowed memory (MB) + */ + static boolean ValidateMemoryGrowth( + float max_growth_allowed, int max_mem_allowed) + { + // Allocate list starting capacity to hold up to 24 hours worth of + // snapshots. 
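+ // At one snapshot every 5 seconds, a 24-hour run produces 17,280
+ // snapshots, so a starting capacity of 20,000 leaves headroom.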
+ List memory_snapshots = new ArrayList(20000); + while (!done) { + try { + Thread.sleep(5000); + } + catch (InterruptedException e) { + System.out.println("Memory growth validation interrupted."); + } + System.gc(); + double snapshot = Runtime.getRuntime().totalMemory() + - Runtime.getRuntime().freeMemory(); + memory_snapshots.add(snapshot); + System.out.println("Memory allocated (MB):" + snapshot / 1E6); + } + if (memory_snapshots.size() < 5) { + System.out.println( + "Error: Not enough snapshots, found " + memory_snapshots.size() + + " snapshots"); + return false; + } + + // Measure memory growth without outliers by taking difference + // between 90th percentile and 10th percentile memory usage. + final double bytes_in_mb = 1E6; + Collections.sort(memory_snapshots); + int index_max = + ((int) Math.ceil(max_percentile / 100.0 * memory_snapshots.size())) - 1; + int index_min = + ((int) Math.ceil(min_percentile / 100.0 * memory_snapshots.size())) - 1; + double memory_allocation_delta = + memory_snapshots.get(index_max) - memory_snapshots.get(index_min); + double memory_allocation_delta_mb = memory_allocation_delta / bytes_in_mb; + double memory_allocation_delta_percent = + memory_allocation_delta / memory_snapshots.get(index_max); + + System.out.println( + "Change in memory allocation (MB): " + memory_allocation_delta_mb + ", " + + (memory_allocation_delta_percent * 100) + "%"); + + boolean passed = true; + + if (memory_allocation_delta_percent >= max_growth_allowed) { + passed = false; + System.out.println( + "Exceeded allowed memory growth (" + (max_growth_allowed * 100) + + "%)"); + } + + if ((memory_snapshots.get(index_max) / bytes_in_mb) >= max_mem_allowed) { + passed = false; + System.out.println( + "Exceeded allowed memory (" + max_mem_allowed + "MB), got " + + (memory_snapshots.get(index_max) / bytes_in_mb) + "MB"); + } + return passed; + } + + static void RunInference( + TRITONSERVER_ServerDeleter server, String model_name, boolean[] is_int, + boolean[] is_torch_model, boolean check_accuracy) throws Exception + { + // Create the allocator that will be used to allocate buffers for + // the result tensors. + TRITONSERVER_ResponseAllocator allocator = + new TRITONSERVER_ResponseAllocator(null); + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorNew( + allocator, responseAlloc, responseRelease, null /* start_fn */), + "creating response allocator"); + + // Inference + TRITONSERVER_InferenceRequest irequest = + new TRITONSERVER_InferenceRequest(null); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestNew( + irequest, server, model_name, -1 /* model_version */), + "creating inference request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"), + "setting ID for the request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetReleaseCallback( + irequest, inferRequestComplete, null /* request_release_userp */), + "setting request release callback"); + + // Inputs + String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT0"; + String input1 = is_torch_model[0] ? "INPUT__1" : "INPUT1"; + + long[] input0_shape = {1, 16}; + long[] input1_shape = {1, 16}; + + int datatype = + (is_int[0]) ? 
TRITONSERVER_TYPE_INT32 : TRITONSERVER_TYPE_FP32; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddInput( + irequest, input0, datatype, input0_shape, input0_shape.length), + "setting input 0 meta-data for the request"); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddInput( + irequest, input1, datatype, input1_shape, input1_shape.length), + "setting input 1 meta-data for the request"); + + String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT0"; + String output1 = is_torch_model[0] ? "OUTPUT__1" : "OUTPUT1"; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0), + "requesting output 0 for the request"); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output1), + "requesting output 1 for the request"); + + // Create the data for the two input tensors. Initialize the first + // to unique values and the second to all ones. + BytePointer input0_data; + BytePointer input1_data; + if (is_int[0]) { + IntPointer[] p0 = {null}, p1 = {null}; + GenerateInputData(p0, p1); + input0_data = p0[0].getPointer(BytePointer.class); + input1_data = p1[0].getPointer(BytePointer.class); + } else { + FloatPointer[] p0 = {null}, p1 = {null}; + GenerateInputData(p0, p1); + input0_data = p0[0].getPointer(BytePointer.class); + input1_data = p1[0].getPointer(BytePointer.class); + } + + long input0_size = input0_data.limit(); + long input1_size = input1_data.limit(); + + Pointer input0_base = input0_data; + Pointer input1_base = input1_data; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAppendInputData( + irequest, input0, input0_base, input0_size, requested_memory_type, + 0 /* memory_type_id */), + "assigning INPUT0 data"); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAppendInputData( + irequest, input1, input1_base, input1_size, requested_memory_type, + 0 /* memory_type_id */), + "assigning INPUT1 data"); + + // Perform inference... + { + CompletableFuture completed = + new CompletableFuture<>(); + futures.put(irequest, completed); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, allocator, null /* response_allocator_userp */, + inferResponseComplete, irequest), + "setting response callback"); + + FAIL_IF_ERR( + TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), + "running inference"); + + // Wait for the inference to complete. + TRITONSERVER_InferenceResponse completed_response = completed.get(); + futures.remove(irequest); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseError(completed_response), + "response status"); + if (check_accuracy) { + Check( + completed_response, input0_data, input1_data, output0, output1, + input0_size, datatype, is_int[0]); + } + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseDelete(completed_response), + "deleting inference response"); + } + + // Modify some input data in place and then reuse the request + // object. For simplicity we only do this when the input tensors are + // in non-pinned system memory. + if (!enforce_memory_type + || (requested_memory_type == TRITONSERVER_MEMORY_CPU)) { + if (is_int[0]) { + new IntPointer(input0_data).put(0, 27); + } else { + new FloatPointer(input0_data).put(0, 27.0f); + } + + CompletableFuture completed = + new CompletableFuture<>(); + futures.put(irequest, completed); + + // Using a new promise so have to re-register the callback to set + // the promise as the userp. 
+ FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, allocator, null /* response_allocator_userp */, + inferResponseComplete, irequest), + "setting response callback"); + + FAIL_IF_ERR( + TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), + "running inference"); + + // Wait for the inference to complete. + TRITONSERVER_InferenceResponse completed_response = completed.get(); + futures.remove(irequest); + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseError(completed_response), + "response status"); + if (check_accuracy) { + Check( + completed_response, input0_data, input1_data, output0, output1, + input0_size, datatype, is_int[0]); + } + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseDelete(completed_response), + "deleting inference response"); + } + + // Remove input data and then add back different data. + { + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestRemoveAllInputData(irequest, input0), + "removing INPUT0 data"); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAppendInputData( + irequest, input0, input1_base, input1_size, requested_memory_type, + 0 /* memory_type_id */), + "assigning INPUT1 data to INPUT0"); + + CompletableFuture completed = + new CompletableFuture<>(); + futures.put(irequest, completed); + + // Using a new promise so have to re-register the callback to set + // the promise as the userp. + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, allocator, null /* response_allocator_userp */, + inferResponseComplete, irequest), + "setting response callback"); + + FAIL_IF_ERR( + TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), + "running inference"); + + // Wait for the inference to complete. + TRITONSERVER_InferenceResponse completed_response = completed.get(); + futures.remove(irequest); + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseError(completed_response), + "response status"); + + if (check_accuracy) { + // Both inputs are using input1_data... + Check( + completed_response, input1_data, input1_data, output0, output1, + input0_size, datatype, is_int[0]); + } + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseDelete(completed_response), + "deleting inference response"); + } + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestDelete(irequest), + "deleting inference request"); + + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorDelete(allocator), + "deleting response allocator"); + } + + public static void main(String[] args) throws Exception + { + int num_iterations = 1000000; + String model_repository_path = null; + int verbose_level = 0; + boolean check_accuracy = false; + + // Parse commandline... 
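+ // Example invocation (hypothetical paths/values; flags match Usage() above):
+ //   java MemoryGrowthTest -r /opt/tritonserver/qa/models -i 100000 -c --max-growth 10 --max-memory 30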
+ for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "-i": + i++; + try { + num_iterations = Integer.parseInt(args[i]); + } + catch (NumberFormatException e) { + Usage("-i must be used to specify number of iterations"); + } + break; + case "-m": + enforce_memory_type = true; + i++; + if (args[i].equals("system")) { + requested_memory_type = TRITONSERVER_MEMORY_CPU; + } else if (args[i].equals("pinned")) { + requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED; + } else if (args[i].equals("gpu")) { + requested_memory_type = TRITONSERVER_MEMORY_GPU; + } else { + Usage( + "-m must be used to specify one of the following types:" + + " <\"system\"|\"pinned\"|gpu>"); + } + break; + case "-r": + model_repository_path = args[++i]; + break; + case "-v": + verbose_level = 1; + break; + case "-c": + check_accuracy = true; + break; + case "-?": + Usage(null); + break; + case "--max-growth": + i++; + try { + max_growth_allowed = Integer.parseInt(args[i]) / 100.0f; + } + catch (NumberFormatException e) { + Usage( + "--max-growth must be an integer value specifying allowed memory growth (%)"); + } + break; + case "--max-memory": + i++; + try { + max_mem_allowed = Integer.parseInt(args[i]); + } + catch (NumberFormatException e) { + Usage( + "--max-memory must be an integer value specifying maximum allowed memory (MB)"); + } + break; + } + } + + if (model_repository_path == null) { + Usage("-r must be used to specify model repository path"); + } + if (enforce_memory_type + && requested_memory_type != TRITONSERVER_MEMORY_CPU) { + Usage("-m can only be set to \"system\" without enabling GPU"); + } + + // Check API version. + int[] api_version_major = {0}, api_version_minor = {0}; + FAIL_IF_ERR( + TRITONSERVER_ApiVersion(api_version_major, api_version_minor), + "getting Triton API version"); + if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) + || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) { + FAIL("triton server API version mismatch"); + } + + // Create the server... + TRITONSERVER_ServerOptions server_options = + new TRITONSERVER_ServerOptions(null); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsNew(server_options), + "creating server options"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetModelRepositoryPath( + server_options, model_repository_path), + "setting model repository path"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level), + "setting verbose logging level"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetBackendDirectory( + server_options, "/opt/tritonserver/backends"), + "setting backend directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetRepoAgentDirectory( + server_options, "/opt/tritonserver/repoagents"), + "setting repository agent directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true), + "setting strict model configuration"); + double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY; + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability( + server_options, min_compute_capability), + "setting minimum supported CUDA compute capability"); + + TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null); + FAIL_IF_ERR( + TRITONSERVER_ServerNew(server_ptr, server_options), "creating server"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsDelete(server_options), + "deleting server options"); + + TRITONSERVER_ServerDeleter server = + new TRITONSERVER_ServerDeleter(server_ptr); + + // Wait until the server is both live and ready. 
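+ // The poll below retries the liveness/readiness check up to 10 times with a
+ // 500 ms sleep between attempts (roughly 5 seconds) before failing.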
+ int health_iters = 0; + while (true) { + boolean[] live = {false}, ready = {false}; + FAIL_IF_ERR( + TRITONSERVER_ServerIsLive(server, live), + "unable to get server liveness"); + FAIL_IF_ERR( + TRITONSERVER_ServerIsReady(server, ready), + "unable to get server readiness"); + System.out.println( + "Server Health: live " + live[0] + ", ready " + ready[0]); + if (live[0] && ready[0]) { + break; + } + + if (++health_iters >= 10) { + FAIL("failed to find healthy inference server"); + } + + Thread.sleep(500); + } + + // Print status of the server. + { + TRITONSERVER_Message server_metadata_message = + new TRITONSERVER_Message(null); + FAIL_IF_ERR( + TRITONSERVER_ServerMetadata(server, server_metadata_message), + "unable to get server metadata message"); + BytePointer buffer = new BytePointer((Pointer) null); + SizeTPointer byte_size = new SizeTPointer(1); + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + server_metadata_message, buffer, byte_size), + "unable to serialize server metadata message"); + + System.out.println("Server Status:"); + System.out.println(buffer.limit(byte_size.get()).getString()); + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(server_metadata_message), + "deleting status metadata"); + } + + String model_name = "simple"; + + // Wait for the model to become available. + boolean[] is_torch_model = {false}; + boolean[] is_int = {true}; + boolean[] is_ready = {false}; + health_iters = 0; + while (!is_ready[0]) { + FAIL_IF_ERR( + TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready), + "unable to get model readiness"); + if (!is_ready[0]) { + if (++health_iters >= 10) { + FAIL("model failed to be ready in 10 iterations"); + } + Thread.sleep(500); + continue; + } + + TRITONSERVER_Message model_metadata_message = + new TRITONSERVER_Message(null); + FAIL_IF_ERR( + TRITONSERVER_ServerModelMetadata( + server, model_name, 1, model_metadata_message), + "unable to get model metadata message"); + BytePointer buffer = new BytePointer((Pointer) null); + SizeTPointer byte_size = new SizeTPointer(1); + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + model_metadata_message, buffer, byte_size), + "unable to serialize model status protobuf"); + + JsonParser parser = new JsonParser(); + JsonObject model_metadata = null; + try { + model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()) + .getAsJsonObject(); + } + catch (Exception e) { + FAIL("error: failed to parse model metadata from JSON: " + e); + } + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(model_metadata_message), + "deleting status protobuf"); + + if (!model_metadata.get("name").getAsString().equals(model_name)) { + FAIL("unable to find metadata for model"); + } + + boolean found_version = false; + if (model_metadata.has("versions")) { + for (JsonElement version : + model_metadata.get("versions").getAsJsonArray()) { + if (version.getAsString().equals("1")) { + found_version = true; + break; + } + } + } + if (!found_version) { + FAIL("unable to find version 1 status for model"); + } + + FAIL_IF_ERR( + ParseModelMetadata(model_metadata, is_int, is_torch_model), + "parsing model metadata"); + } + + Runnable runnable = () -> + { + boolean passed = + ValidateMemoryGrowth(max_growth_allowed, max_mem_allowed); + + // Sleep to give the garbage collector time to free the server. + // This avoids race conditions between Triton bindings' printing and + // Java's native printing below. 
+ try { + Thread.sleep(5000); + } + catch (InterruptedException e) { + System.out.println("Sleep interrupted: " + e.toString()); + } + + if (passed) { + System.out.println("Memory growth test passed"); + } else { + System.out.println("Memory growth test FAILED"); + } + }; + Thread memory_thread = new Thread(runnable); + memory_thread.start(); + + for (int i = 0; i < num_iterations; i++) { + try (PointerScope scope = new PointerScope()) { + RunInference( + server, model_name, is_int, is_torch_model, check_accuracy); + } + } + done = true; + memory_thread.join(); + + System.exit(0); + } +} diff --git a/qa/L0_java_memory_growth/test.sh b/qa/L0_java_memory_growth/test.sh new file mode 100755 index 0000000000..d5ec33a5d5 --- /dev/null +++ b/qa/L0_java_memory_growth/test.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Set up test files based on installation instructions +# https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/README.md +JAVACPP_BRANCH=${JAVACPP_BRANCH:="https://github.com/bytedeco/javacpp-presets.git"} +JAVACPP_BRANCH_TAG=${JAVACPP_BRANCH_TAG:="master"} +set -e +git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git +source client/src/java-api-bindings/scripts/install_dependencies_and_build.sh -b $PWD --javacpp-branch ${JAVACPP_BRANCH} --javacpp-tag ${JAVACPP_BRANCH_TAG} --keep-build-dependencies +cd .. + +export MAVEN_OPTS="-XX:MaxGCPauseMillis=40" +MODEL_REPO=`pwd`/models +SAMPLES_REPO=`pwd`/javacpp-presets/tritonserver/samples/simple +BASE_COMMAND="mvn clean compile -f $SAMPLES_REPO exec:java -Djavacpp.platform=linux-x86_64" +source ../common/util.sh + +# Create local model repository +rm -rf ${MODEL_REPO} +mkdir ${MODEL_REPO} +cp -r `pwd`/../L0_simple_ensemble/models/simple ${MODEL_REPO}/. 
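+
+# Build the test by reusing the javacpp-presets "simple" sample project:
+# copy MemoryGrowthTest.java into it and point its pom.xml at the new class.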
+ +cp MemoryGrowthTest.java $SAMPLES_REPO +sed -i 's/Simple/MemoryGrowthTest/g' $SAMPLES_REPO/pom.xml + +rm -f *.log +RET=0 + + +# Sanity test: check accuracy +ITERS=200000 + +LOG_IDX=0 +CLIENT_LOG="./client_$LOG_IDX.log" + +echo -e "\nRunning Sanity Test (accuracy checking)\n" +$BASE_COMMAND -Dexec.args="-r $MODEL_REPO -i $ITERS" >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to run sanity test to complete\n***" + RET=1 +fi + +if [ `grep -c "Memory growth test passed" $CLIENT_LOG` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 'Memory growth test passed' in $CLIENT_LOG\n***" + cat $CLIENT_LOG + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) +CLIENT_LOG="./client_$LOG_IDX.log" + +# Longer-running memory growth test +ITERS=1000000 +MAX_MEM_GROWTH_MB=10 +if [ "$TRITON_PERF_LONG" == 1 ]; then + # ~1 day + ITERS=150000000 + MAX_MEM_GROWTH_MB=25 +fi + +echo -e "\nRunning Memory Growth Test, $ITERS Iterations\n" +$BASE_COMMAND -Dexec.args="-r $MODEL_REPO -c -i $ITERS --max-growth $MAX_MEM_GROWTH_MB" >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to run memory growth test to complete\n***" + RET=1 +fi + +if [ `grep -c "Memory growth test passed" $CLIENT_LOG` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 'Memory growth test passed' in $CLIENT_LOG\n***" + cat $CLIENT_LOG + RET=1 +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_java_resnet/ResnetTest.java b/qa/L0_java_resnet/ResnetTest.java new file mode 100644 index 0000000000..4827273926 --- /dev/null +++ b/qa/L0_java_resnet/ResnetTest.java @@ -0,0 +1,639 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
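+
+// ResnetTest exercises the in-process Java API against the ResNet50 models
+// listed in MODELS (libtorch and onnx; the savedmodel variant is disabled).
+// Each model is run once with an all-ones FP32 input of 3x224x224 elements,
+// and the resulting class scores are compared element-wise against the
+// reference values in expected_output_data/ within ALLOWED_DELTA.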
+ +import static org.bytedeco.tritonserver.global.tritonserver.*; + +import com.google.gson.*; +import java.io.*; +import java.util.*; +import java.util.concurrent.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.tritonserver.tritonserver.*; + +public class ResnetTest { + // Maximum allowed difference from expected model outputs + private static final float ALLOWED_DELTA = .001f; + private static final String[] MODELS = { + "resnet50_fp32_libtorch", "resnet50_fp32_onnx", + // TODO: fix build to support GPU only resnet50v1.5_fp16_savedmodel + //"resnet50v1.5_fp16_savedmodel", + }; + private static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0; + private enum Backend { + NONE, + ONNX, + TF, + TORCH, + } + + static void FAIL(String MSG) + { + System.err.println("failure: " + MSG); + System.exit(1); + } + + static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) + { + if (err__ != null) { + System.err.println( + "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - " + + TRITONSERVER_ErrorMessage(err__)); + TRITONSERVER_ErrorDelete(err__); + System.exit(1); + } + } + + static boolean enforce_memory_type = false; + static int requested_memory_type; + + static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server { + public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) + { + super(p); + deallocator(new DeleteDeallocator(this)); + } + protected static class DeleteDeallocator + extends TRITONSERVER_Server implements Deallocator { + DeleteDeallocator(Pointer p) { super(p); } + @Override public void deallocate() { TRITONSERVER_ServerDelete(this); } + } + } + + static void Usage(String msg) + { + if (msg != null) { + System.err.println(msg); + } + + System.err.println( + "Usage: java " + ResnetTest.class.getSimpleName() + " [options]"); + System.err.println( + "\t-m <\"system\"|\"pinned\"|gpu>" + + " Enforce the memory type for input and output tensors." + + " If not specified, inputs will be in system memory and outputs" + + " will be based on the model's preferred type."); + System.err.println("\t-v Enable verbose logging"); + System.err.println("\t-r [model repository absolute path]"); + + System.exit(1); + } + + static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t { + @Override + public TRITONSERVER_Error call( + TRITONSERVER_ResponseAllocator allocator, String tensor_name, + long byte_size, int preferred_memory_type, + long preferred_memory_type_id, Pointer userp, PointerPointer buffer, + PointerPointer buffer_userp, IntPointer actual_memory_type, + LongPointer actual_memory_type_id) + { + // Initially attempt to make the actual memory type and id that we + // allocate be the same as preferred memory type + actual_memory_type.put(0, preferred_memory_type); + actual_memory_type_id.put(0, preferred_memory_type_id); + + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't + // need to do any other book-keeping. + if (byte_size == 0) { + buffer.put(0, null); + buffer_userp.put(0, null); + System.out.println( + "allocated " + byte_size + " bytes for result tensor " + + tensor_name); + } else { + Pointer allocated_ptr = new Pointer(); + if (enforce_memory_type) { + actual_memory_type.put(0, requested_memory_type); + } + + actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU); + allocated_ptr = Pointer.malloc(byte_size); + + // Pass the tensor name with buffer_userp so we can show it when + // releasing the buffer. 
+ if (!allocated_ptr.isNull()) { + buffer.put(0, allocated_ptr); + buffer_userp.put(0, Loader.newGlobalRef(tensor_name)); + System.out.println( + "allocated " + byte_size + " bytes in " + + TRITONSERVER_MemoryTypeString(actual_memory_type.get()) + + " for result tensor " + tensor_name); + } + } + + return null; // Success + } + } + + static class ResponseRelease + extends TRITONSERVER_ResponseAllocatorReleaseFn_t { + @Override + public TRITONSERVER_Error call( + TRITONSERVER_ResponseAllocator allocator, Pointer buffer, + Pointer buffer_userp, long byte_size, int memory_type, + long memory_type_id) + { + String name = null; + if (buffer_userp != null) { + name = (String) Loader.accessGlobalRef(buffer_userp); + } else { + name = ""; + } + + Pointer.free(buffer); + Loader.deleteGlobalRef(buffer_userp); + + return null; // Success + } + } + + static class InferRequestComplete + extends TRITONSERVER_InferenceRequestReleaseFn_t { + @Override + public void call( + TRITONSERVER_InferenceRequest request, int flags, Pointer userp) + { + // We reuse the request so we don't delete it here. + } + } + + static class InferResponseComplete + extends TRITONSERVER_InferenceResponseCompleteFn_t { + @Override + public void call( + TRITONSERVER_InferenceResponse response, int flags, Pointer userp) + { + if (response != null) { + // Send 'response' to the future. + futures.get(userp).complete(response); + } + } + } + + static ConcurrentHashMap< + Pointer, CompletableFuture> futures = + new ConcurrentHashMap<>(); + static ResponseAlloc responseAlloc = new ResponseAlloc(); + static ResponseRelease responseRelease = new ResponseRelease(); + static InferRequestComplete inferRequestComplete = new InferRequestComplete(); + static InferResponseComplete inferResponseComplete = + new InferResponseComplete(); + + static void GenerateInputData(FloatPointer[] input_data) + { + // Input size is 3 * 224 * 224 + input_data[0] = new FloatPointer(150528); + for (int i = 0; i < 150528; ++i) { + input_data[0].put(i, 1); + } + } + + static boolean AreValidResults( + String model_name, FloatPointer output, FloatPointer expected_output) + { + int output_length = model_name.contains("tensorflow") ? 
1001 : 1000;
+    for (int i = 0; i < output_length; ++i) {
+      float difference = Math.abs(output.get(i) - expected_output.get(i));
+      if (difference > ALLOWED_DELTA) {
+        System.out.println(
+            "inference failure: unexpected output in " + model_name
+            + ", index " + i);
+
+        System.out.println(
+            "Value: " + output.get(i) + ", expected " + expected_output.get(i));
+
+        return false; // Failure
+      }
+    }
+    return true; // Success
+  }
+
+  static void Check(
+      String model_name, Backend backend,
+      TRITONSERVER_InferenceResponse response, Pointer input_data,
+      String output, int expected_datatype) throws Exception
+  {
+    HashMap<String, BytePointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 1) {
+      FAIL("expecting 1 response output, got " + output_count[0]);
+    }
+
+    for (int idx = 0; idx < output_count[0]; ++idx) {
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
+
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size,
+              memory_type, memory_type_id, userp),
+          "getting output info");
+
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
+      }
+
+      String name = cname.getString();
+      if (!name.equals(output)) {
+        FAIL("unexpected output '" + name + "'");
+      }
+
+      int output_length = backend == Backend.TF ? 1001 : 1000;
+
+      if ((dim_count.get() != 2) || (shape.get(0) != 1)
+          || shape.get(1) != output_length) {
+        FAIL("unexpected shape for '" + name + "'");
+      }
+
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '"
+            + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
+      }
+
+      if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id "
+            + memory_type_id.get() + " for " + name);
+      }
+
+      // We make a copy of the output data here; this could be avoided for
+      // performance reasons but is fine for this test.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      odata.put(base.limit(byte_size.get()));
+    }
+
+    // Expected output for the model
+    String file_name = "expected_output_data/expected_output_";
+    switch (backend) {
+      case ONNX:
+        file_name += "onnx";
+        break;
+      case TF:
+        file_name += "tensorflow";
+        break;
+      case TORCH:
+        file_name += "pytorch";
+        break;
+      default:
+        FAIL("Unsupported model type");
+        break;
+    }
+    file_name += ".txt";
+
+    int output_length = backend == Backend.TF ?
1001 : 1000; + FloatPointer expected_output = new FloatPointer(output_length); + + try (Scanner scanner = new Scanner(new File(file_name))) { + for (int i = 0; i < output_length; ++i) { + expected_output.put(i, scanner.nextFloat()); + } + } + + boolean correct_results = AreValidResults( + model_name, new FloatPointer(output_data.get(output)), expected_output); + + if (correct_results) { + System.out.println(backend.name() + " test PASSED"); + } else { + System.out.println(backend.name() + " test FAILED"); + } + } + + static void PerformInference( + TRITONSERVER_ServerDeleter server, String model_name) throws Exception + { + // Get type of model + Backend backend = Backend.NONE; + if (model_name.contains("onnx")) { + backend = Backend.ONNX; + } else if (model_name.contains("savedmodel")) { + backend = Backend.TF; + } else if (model_name.contains("torch")) { + backend = Backend.TORCH; + } else { + FAIL( + "Supported model types (Onnx, TensorFlow, Torch) " + + "cannot be inferred from model name " + model_name); + } + + // Wait for the model to become available. + boolean[] is_ready = {false}; + int health_iters = 0; + while (!is_ready[0]) { + FAIL_IF_ERR( + TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready), + "unable to get model readiness"); + if (!is_ready[0]) { + if (++health_iters >= 10) { + FAIL(model_name + " model failed to be ready in 10 iterations"); + } + Thread.sleep(500); + continue; + } + } + + // Create the allocator that will be used to allocate buffers for + // the result tensors. + TRITONSERVER_ResponseAllocator allocator = + new TRITONSERVER_ResponseAllocator(null); + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorNew( + allocator, responseAlloc, responseRelease, null /* start_fn */), + "creating response allocator"); + + // Inference + TRITONSERVER_InferenceRequest irequest = + new TRITONSERVER_InferenceRequest(null); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestNew( + irequest, server, model_name, -1 /* model_version */), + "creating inference request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"), + "setting ID for the request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetReleaseCallback( + irequest, inferRequestComplete, null /* request_release_userp */), + "setting request release callback"); + + + // Model inputs + String input = ""; + String output = ""; + long[] input_shape = {1, 224, 224, 3}; + + switch (backend) { + case ONNX: + input = "import/input:0"; + output = "import/resnet_v1_50/predictions/Softmax:0"; + break; + case TF: + input = "input"; + output = "probabilities"; + break; + case TORCH: + input = "INPUT__0"; + input_shape[1] = 3; + input_shape[3] = 224; + output = "OUTPUT__0"; + break; + default: + FAIL("Unsupported model type"); + break; + } + + int datatype = TRITONSERVER_TYPE_FP32; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddInput( + irequest, input, datatype, input_shape, input_shape.length), + "setting input 0 meta-data for the request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output), + "requesting output 0 for the request"); + + // Create the data for the two input tensors. Initialize the first + // to unique values and the second to all ones. 
+    BytePointer input_data;
+    FloatPointer[] p0 = {null};
+    GenerateInputData(p0);
+    input_data = p0[0].getPointer(BytePointer.class);
+    long input_size = input_data.limit();
+    Pointer input_base = input_data;
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input, input_base, input_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT data");
+
+    // Perform inference...
+    {
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed =
+          new CompletableFuture<>();
+      futures.put(irequest, completed);
+
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */,
+              inferResponseComplete, irequest),
+          "setting response callback");
+
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */),
+          "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseError(completed_response),
+          "response status");
+
+      Check(
+          model_name, backend, completed_response, input_data, output,
+          datatype);
+
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseDelete(completed_response),
+          "deleting inference response");
+    }
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestDelete(irequest),
+        "deleting inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorDelete(allocator),
+        "deleting response allocator");
+  }
+
+  public static void main(String[] args) throws Exception
+  {
+    String model_repository_path = null;
+    int verbose_level = 0;
+
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-m": {
+          enforce_memory_type = true;
+          i++;
+          if (args[i].equals("system")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU;
+          } else if (args[i].equals("pinned")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
+          } else if (args[i].equals("gpu")) {
+            requested_memory_type = TRITONSERVER_MEMORY_GPU;
+          } else {
+            Usage(
+                "-m must be used to specify one of the following types:"
+                + " <\"system\"|\"pinned\"|gpu>");
+          }
+          break;
+        }
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-?":
+          Usage(null);
+          break;
+      }
+    }
+
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
+    if (enforce_memory_type
+        && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
+      Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
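+    // The options below configure the model repository path, log verbosity,
+    // backend/repoagent directories, strict model configuration, and the
+    // minimum supported CUDA compute capability.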
+ TRITONSERVER_ServerOptions server_options = + new TRITONSERVER_ServerOptions(null); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsNew(server_options), + "creating server options"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetModelRepositoryPath( + server_options, model_repository_path), + "setting model repository path"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level), + "setting verbose logging level"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetBackendDirectory( + server_options, "/opt/tritonserver/backends"), + "setting backend directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetRepoAgentDirectory( + server_options, "/opt/tritonserver/repoagents"), + "setting repository agent directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true), + "setting strict model configuration"); + double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY; + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability( + server_options, min_compute_capability), + "setting minimum supported CUDA compute capability"); + + TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null); + FAIL_IF_ERR( + TRITONSERVER_ServerNew(server_ptr, server_options), "creating server"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsDelete(server_options), + "deleting server options"); + + TRITONSERVER_ServerDeleter server = + new TRITONSERVER_ServerDeleter(server_ptr); + + // Wait until the server is both live and ready. + int health_iters = 0; + while (true) { + boolean[] live = {false}, ready = {false}; + FAIL_IF_ERR( + TRITONSERVER_ServerIsLive(server, live), + "unable to get server liveness"); + FAIL_IF_ERR( + TRITONSERVER_ServerIsReady(server, ready), + "unable to get server readiness"); + System.out.println( + "Server Health: live " + live[0] + ", ready " + ready[0]); + if (live[0] && ready[0]) { + break; + } + + if (++health_iters >= 10) { + FAIL("failed to find healthy inference server"); + } + + Thread.sleep(500); + } + + // Print status of the server. 
+ { + TRITONSERVER_Message server_metadata_message = + new TRITONSERVER_Message(null); + FAIL_IF_ERR( + TRITONSERVER_ServerMetadata(server, server_metadata_message), + "unable to get server metadata message"); + BytePointer buffer = new BytePointer((Pointer) null); + SizeTPointer byte_size = new SizeTPointer(1); + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + server_metadata_message, buffer, byte_size), + "unable to serialize server metadata message"); + + System.out.println("Server Status:"); + System.out.println(buffer.limit(byte_size.get()).getString()); + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(server_metadata_message), + "deleting status metadata"); + } + + for (String model : MODELS) { + PerformInference(server, model); + } + + System.exit(0); + } +} diff --git a/qa/L0_java_resnet/expected_output_data/expected_output_onnx.txt b/qa/L0_java_resnet/expected_output_data/expected_output_onnx.txt new file mode 100644 index 0000000000..0d438e670e --- /dev/null +++ b/qa/L0_java_resnet/expected_output_data/expected_output_onnx.txt @@ -0,0 +1,1000 @@ +0.00016980497 +0.0003274878 +5.9229897e-05 +0.00010386822 +5.1683604e-05 +0.0005200729 +9.252152e-06 +3.5043122e-05 +1.7310056e-05 +0.00014115982 +0.0007192011 +0.00014146192 +5.864904e-05 +8.102552e-05 +1.6766031e-05 +4.9913597e-05 +0.00012557638 +2.9249455e-05 +5.8186713e-05 +4.997704e-05 +0.00019478115 +0.001593597 +0.0009770031 +0.00022523475 +8.752639e-05 +0.00011251909 +0.00031572866 +0.00023567723 +0.00017536257 +0.00018674227 +4.333203e-05 +0.00033384693 +8.9560366e-05 +0.00011413613 +0.00028333033 +1.6440303e-05 +0.000121921854 +1.1142264e-05 +0.0059000477 +3.741594e-05 +4.867915e-05 +0.00020082401 +0.00023553567 +0.00016318199 +5.550063e-05 +0.00012654626 +4.0553005e-05 +0.00023072284 +3.575522e-05 +3.5885336e-05 +0.000167727 +0.0004284156 +0.00029606326 +0.0005308822 +0.00025003406 +2.4711164e-05 +5.9230402e-05 +1.4644651e-05 +0.00013812816 +0.0030018578 +0.0004657613 +3.8773556e-05 +0.00029646824 +0.00039393824 +0.0006814109 +0.00017464366 +0.000501648 +6.748e-05 +0.00021987612 +4.2551095e-05 +7.442098e-05 +0.00073552737 +6.145523e-05 +0.0019270201 +1.1406245e-05 +0.00033168247 +2.7609263e-05 +0.00055849075 +0.0018151653 +0.0012854141 +0.0005644851 +0.0002643019 +0.00012686521 +0.00031014124 +3.576934e-05 +1.5226503e-05 +0.00023373427 +0.00025264034 +9.125392e-05 +0.00010886967 +5.68172e-05 +0.00022797973 +0.0005024418 +0.00013592323 +0.00016360248 +4.724841e-05 +0.00016500028 +3.5815625e-05 +0.0009926121 +0.00018996779 +0.00032009778 +6.5463086e-05 +4.915879e-05 +0.0023545807 +0.00019779587 +9.740985e-06 +5.916514e-05 +8.342835e-05 +3.5502824e-05 +5.5155975e-05 +0.0002953913 +0.14522666 +0.00026150284 +0.0004633083 +0.00010001568 +7.724773e-05 +0.00020212827 +0.0003651837 +2.3665098e-05 +8.007319e-05 +0.000164059 +2.0919639e-05 +0.00015904989 +2.8181286e-05 +2.1252014e-05 +0.00016757358 +0.0026105049 +0.00016491314 +0.0033536772 +0.00045177306 +0.00015669028 +5.8178866e-05 +0.0005335324 +7.4700896e-05 +4.13347e-05 +0.00013332519 +0.00024123705 +0.00024420477 +0.00010249778 +0.00014476122 +0.00043495715 +0.00040990766 +0.00021976302 +0.00028396113 +0.00018292265 +0.0005685563 +0.0005261158 +0.0005394564 +0.0006722254 +0.00041848654 +0.0002058497 +0.00020697096 +0.00038915384 +0.00063642685 +3.952872e-05 +4.7074976e-05 +0.0001484932 +0.0001767462 +0.00018367334 +9.1362854e-05 +0.00020925087 +4.683706e-05 +8.098025e-05 +0.00038643452 +2.1166008e-05 +0.00023816078 +0.00040344987 +0.00014309994 +0.00016946216 +0.0001158025 
+0.00015477811 +0.00013820603 +0.00039157816 +0.00012628519 +6.416812e-05 +5.319338e-05 +8.096635e-05 +9.268181e-05 +0.00021009038 +8.123741e-05 +2.1137266e-05 +0.00013860558 +9.888543e-06 +9.180427e-05 +5.726596e-05 +0.00024706984 +3.4873163e-05 +9.941785e-05 +0.0002506603 +0.00011764638 +0.00086345134 +0.00011305928 +3.6803817e-06 +8.0881466e-05 +0.00017012736 +0.0003054968 +5.9778555e-05 +1.0738367e-05 +3.9709514e-05 +7.807765e-05 +8.485133e-05 +1.4551556e-05 +5.0553328e-05 +0.0001432179 +0.00012594614 +4.33862e-05 +0.00016131556 +0.00012815706 +4.6910594e-05 +5.9233225e-05 +5.5869554e-05 +7.410936e-05 +9.937572e-05 +6.092812e-05 +6.309549e-05 +8.338313e-05 +0.00044575817 +5.111232e-05 +2.1025462e-05 +4.1145802e-05 +0.00019077354 +0.00019071896 +0.00025231927 +0.00019271992 +0.00013492932 +0.00010883319 +2.025502e-05 +0.0002089905 +9.62682e-05 +0.00012668235 +1.5566122e-05 +2.2314523e-05 +0.00017040399 +0.0001946466 +2.8189646e-05 +4.8383175e-05 +0.00013236424 +0.00016888845 +5.468688e-05 +0.00014190435 +8.5229825e-05 +5.173721e-05 +3.7611204e-05 +9.9274024e-05 +3.191364e-05 +6.1621664e-05 +0.00013842362 +6.9894915e-05 +9.658343e-05 +6.903254e-05 +0.0002400999 +0.00026015204 +0.000105622945 +0.0001664888 +0.00013265685 +1.5738156e-05 +0.0003335177 +0.00010971267 +0.0002484887 +0.00019186472 +8.8625755e-05 +6.912767e-05 +0.00045799493 +5.394646e-05 +0.00017973136 +8.907009e-05 +0.000110481764 +4.1266052e-05 +0.00013683847 +4.2938726e-05 +0.00012697978 +5.5856824e-05 +0.00014599289 +9.960172e-05 +0.00012956791 +0.00027035273 +0.00026089343 +0.00058428914 +9.604311e-05 +0.00030085753 +0.00013629998 +7.053258e-05 +0.00023789746 +0.00045626136 +0.00024321792 +0.00039255328 +9.378134e-05 +3.3330132e-05 +6.2762956e-05 +0.00010464993 +6.4440836e-05 +0.000114770344 +9.773856e-05 +0.00024476458 +0.00022140365 +8.682848e-05 +0.00014253015 +0.00041922313 +9.2946466e-05 +0.0007321677 +8.819961e-05 +0.00033927264 +0.0001434792 +0.0004997533 +5.05367e-05 +1.6199812e-05 +0.00081437116 +0.00029276052 +0.0003227374 +2.10321e-05 +0.00041501687 +7.6642566e-05 +0.0007460653 +0.00010704513 +0.0010337052 +0.00016585 +0.00010267203 +9.844521e-05 +0.00036912857 +0.0004210494 +0.0007636784 +8.831775e-06 +2.4511684e-05 +6.654908e-05 +3.845051e-05 +3.2900447e-05 +0.0002467062 +5.595124e-05 +0.00010915978 +1.5788999e-05 +0.00010652153 +0.0002424042 +0.0001448311 +1.1700289e-05 +3.8083996e-05 +9.013652e-05 +0.00016588188 +0.00014541998 +4.446017e-05 +5.857866e-05 +5.703819e-05 +6.140147e-05 +2.5429461e-05 +1.2527011e-05 +0.00029506863 +0.00017385624 +4.4041873e-05 +4.213424e-05 +7.223138e-05 +5.3147643e-05 +0.00028015298 +0.0005170326 +9.355127e-05 +0.00023953259 +0.00041169117 +6.737018e-05 +0.00097511435 +0.00044960703 +0.00041690134 +0.00036505918 +0.00035000656 +0.00020413095 +0.00014936135 +4.925268e-05 +4.6020858e-05 +0.0001434502 +3.7963135e-05 +0.00053391827 +3.7399033e-05 +0.000112552734 +8.935715e-05 +0.0008973427 +6.539161e-05 +0.00023165658 +0.0003438208 +6.735287e-05 +0.00016886953 +0.00042564265 +0.0001101864 +3.034124e-05 +0.000176773 +2.9307617e-05 +8.214749e-05 +7.6573786e-05 +0.00032455323 +0.00018222861 +3.7278707e-05 +0.00011895009 +6.777756e-05 +0.00040660411 +4.0756473e-05 +2.686724e-05 +0.0011102126 +1.7472128e-05 +3.215658e-05 +0.00019766577 +2.4107696e-05 +9.5941454e-05 +0.00013294643 +0.012934193 +0.0014889088 +0.00030110637 +0.0004861949 +0.00022020873 +0.0004120663 +0.0028884916 +2.075195e-05 +5.6945166e-05 +0.00010725547 +0.00061704434 +1.2163917e-05 +0.00013528275 +0.000321602 
+0.0049974765 +0.00036395655 +7.939798e-06 +0.0027076406 +0.0009837962 +0.017314037 +0.00036551448 +0.00027795092 +0.00029623153 +0.00016959595 +0.00019360533 +3.4470788e-05 +6.317202e-05 +0.00028958637 +0.00052192796 +9.2430375e-05 +0.0010162767 +0.00010013961 +5.5248547e-05 +0.01881616 +0.000114972405 +0.00012866792 +0.0001735118 +9.917765e-05 +0.0011450195 +0.0015877285 +0.0017322781 +0.00056879356 +0.00025545148 +0.0007390253 +0.00012345372 +0.00022441847 +0.0001914855 +0.0026525552 +0.00044881727 +0.00034022957 +0.00028609563 +1.7402317e-05 +0.004177963 +5.312598e-05 +7.086197e-05 +1.07296755e-05 +0.0003122828 +0.0017724611 +0.0011016912 +2.7802036e-05 +0.00044330902 +2.7724009e-05 +0.00070999836 +0.0025074244 +0.00029760305 +0.0017468698 +0.0033079428 +0.00023698558 +1.8203225e-05 +4.298752e-05 +0.003792394 +0.0043774135 +0.0002578806 +6.7714565e-05 +0.010979906 +7.88773e-05 +0.00020034179 +3.9189625e-05 +7.9022284e-05 +0.00019010075 +0.00018935381 +0.000151655 +0.00063424406 +0.00010652139 +2.2907618e-05 +0.00021650721 +0.0006931013 +0.0016945377 +0.0018049135 +0.0016268345 +1.3866996e-05 +0.00023594845 +0.00077581 +0.00037083545 +0.0002482703 +0.00199448 +8.8194734e-05 +4.5612232e-05 +8.859373e-05 +5.174079e-05 +0.027618717 +7.469677e-05 +0.004212717 +0.00029109194 +0.0042880555 +0.0015089285 +0.0005760798 +0.0002101491 +0.0030193415 +0.0002710225 +1.43144425e-05 +0.0012474942 +7.6482655e-05 +0.012027938 +0.0013138817 +0.00024912177 +0.00039606096 +0.00017222571 +0.00077096495 +9.616005e-05 +0.00012808497 +0.00011093941 +0.0004788455 +0.00027597338 +0.0018378077 +0.00048597282 +2.693032e-05 +0.00015658996 +0.00045992344 +4.849936e-05 +0.00023919567 +0.0032133528 +0.0044528083 +0.00015469016 +8.7847635e-05 +0.0121315615 +0.00018360339 +2.8868575e-05 +7.337089e-05 +0.000533506 +0.0002060245 +0.001834617 +0.0014196439 +0.00109954 +0.0014719801 +0.00013069775 +0.000612675 +0.0007288255 +4.03345e-05 +6.908545e-05 +0.0045452276 +0.00020541927 +0.0022583636 +0.00011107671 +0.00054280076 +0.00014280484 +5.260433e-05 +0.0013882591 +0.0004975726 +0.004215462 +0.00118553 +6.8419955e-05 +7.3308154e-05 +7.351188e-05 +0.0012610124 +2.1918344e-05 +5.3881315e-05 +0.000348318 +0.0111174295 +0.0001844288 +0.00023055756 +0.00067666965 +2.1618225e-05 +0.00065558555 +0.00011886986 +0.004878329 +7.532305e-05 +0.00029515053 +0.0008771214 +0.00044318815 +0.00045352246 +2.219967e-05 +3.4630368e-05 +2.1955417e-05 +0.0082423575 +0.02084665 +0.008617819 +2.37336e-05 +0.0007988152 +0.00033299648 +0.00053600385 +0.00012942769 +0.00023972764 +0.00047354214 +0.0029637653 +0.00017331565 +5.8418576e-05 +0.0026522074 +0.00013416266 +0.00024219774 +0.0002707129 +0.0037202735 +0.0004878337 +0.0016466635 +3.0741547e-05 +0.00824405 +0.0016471919 +0.00048588854 +0.00041886864 +0.00038283042 +5.720226e-05 +0.0013508176 +0.00025465732 +6.677686e-05 +0.0031950285 +0.00022743837 +0.0012873787 +0.0019100192 +0.00016512939 +0.0066867983 +0.0025570705 +7.590332e-05 +0.0001290511 +0.0013077843 +0.009066646 +8.278893e-06 +0.00014440181 +0.008204297 +0.0006864818 +0.0008325608 +0.0047303867 +0.00063803 +0.00058498216 +0.007141755 +0.0025759342 +1.5265148e-05 +0.000791608 +0.0002963567 +6.699214e-05 +0.00015540588 +1.9577861e-05 +0.00019148094 +0.0050711925 +0.0003821164 +0.00031181856 +0.02256623 +9.6739546e-05 +0.00022743792 +0.0002277875 +0.00024204118 +9.2040355e-05 +0.006166843 +0.0004336779 +0.0001697661 +0.0033746548 +0.00019502817 +5.0561524e-05 +2.586181e-05 +0.0010798759 +3.664102e-05 +0.00013510302 +0.00016221526 
+2.2405515e-05 +0.0014313295 +0.00017091136 +0.0023739443 +6.802837e-05 +0.00064769934 +0.00034750463 +0.00011071275 +1.7708879e-05 +0.00013680755 +2.4237579e-05 +0.0003371289 +0.0006825689 +0.0028515519 +0.00011692811 +0.00022007397 +0.02142835 +0.0017977277 +0.00035943018 +0.001095244 +0.00077389204 +0.0002297276 +0.025019487 +0.0019389915 +0.00033054518 +0.0114699 +5.516768e-05 +0.000209548 +0.00040630833 +2.0364629e-05 +0.00039122297 +0.0020364495 +0.0008940088 +6.6173154e-05 +0.00034862926 +0.0042634625 +2.3698478e-05 +5.9804384e-05 +0.0037845175 +0.00018579431 +0.0011340764 +0.0005943249 +0.00020876242 +0.0001095363 +1.866407e-05 +1.5485472e-05 +8.666633e-05 +0.0040748627 +6.6307715e-05 +0.00070469885 +0.0008672148 +0.0002835482 +0.0002781067 +0.0025088897 +0.0002623553 +0.0002617934 +4.9439703e-05 +0.00010924356 +0.00043568495 +0.002368831 +1.9224659e-05 +0.0015811798 +0.0006842592 +0.0002917136 +0.0003131275 +0.00060534995 +0.0001427105 +8.8764216e-05 +0.001122838 +7.210702e-05 +0.0041576345 +0.00011061608 +0.0007480099 +1.3065656e-05 +4.5712564e-05 +0.0007861731 +0.0003158539 +0.00015036995 +0.0003323501 +0.0012030656 +0.00019688989 +0.00016745002 +0.00024887823 +0.0034065044 +0.0023652983 +0.00031526107 +0.0066307853 +0.00017283301 +0.0022883036 +0.00017895563 +0.00018347587 +0.00035834042 +0.0008326437 +0.0017283945 +0.00035829068 +0.00029964442 +2.0670632e-05 +0.0008355308 +0.00048754443 +0.0017713069 +0.00191648 +1.9209521e-05 +0.005908878 +0.002205918 +0.00039330104 +0.00043703758 +0.0017654483 +0.00013185009 +0.00395082 +0.0001576185 +0.00038202494 +0.0038736896 +0.00041661857 +0.00012902985 +7.777089e-05 +0.0017715484 +0.0023155885 +0.00055541855 +4.9337166e-05 +0.00047428903 +0.00043557858 +0.00069765287 +0.009222093 +0.010263749 +6.7705434e-05 +5.966209e-05 +5.7554716e-05 +8.994978e-06 +0.0009418844 +0.00019504203 +0.000114773786 +0.0004218587 +0.00014428151 +8.655709e-05 +0.0008147674 +0.0008794013 +0.00014804432 +0.0027704514 +8.3283114e-05 +4.3073826e-05 +0.00018634342 +9.9652214e-05 +0.000109504916 +0.0067855045 +0.00015742471 +0.00077502604 +0.0006362351 +0.00046153838 +5.4576325e-05 +0.00017408792 +0.0012021991 +0.009413977 +0.022948345 +0.0010692423 +3.5031127e-05 +5.092194e-05 +6.2689374e-05 +0.0068375845 +0.00027439403 +2.1836517e-05 +0.0002581114 +0.00027914194 +0.00027809184 +3.0986383e-05 +3.457496e-05 +0.0046969666 +0.00046523788 +0.0021990726 +5.2927287e-05 +0.00029199888 +0.0006094933 +0.00014609241 +0.0005544162 +0.0021697562 +3.2796317e-05 +0.00084513065 +0.000516489 +0.0005635408 +6.230352e-05 +0.00054642366 +0.00013715419 +0.00013440092 +0.00011689427 +0.00056491833 +0.00064705784 +0.010491602 +3.0012101e-05 +0.0005605288 +0.0002985542 +6.826285e-05 +0.0013857664 +0.0032425607 +1.2750059e-05 +0.00404577 +0.0050039887 +0.0001610246 +0.0003332945 +0.00028637925 +0.0011893546 +0.00030820677 +0.0022603609 +0.0010670897 +0.00031939565 +4.9374088e-05 +9.66541e-06 +8.219991e-05 +0.00027665813 +6.6826746e-05 +0.0003693902 +7.4780626e-05 +0.00018097041 +0.0014217026 +0.00015682563 +5.9905306e-06 +0.0035234408 +0.0001482323 +0.00035662283 +4.427336e-05 +0.00025081105 +0.00036762984 +0.00013225578 +0.00017834459 +0.0041054576 +8.1886355e-05 +0.0006386442 +0.00016379755 +0.00014210392 +8.108431e-05 +0.0007447243 +8.90168e-05 +9.151607e-05 +0.0005884257 +0.0022961798 +0.00013226802 +0.00066101504 +0.00046616056 +0.00064051466 +0.003273349 +0.00048656296 +0.00022358973 +0.0043424554 +0.0039812205 +0.00028370952 +0.0008125159 +0.0004582208 +0.0012607021 +0.0009775694 
+0.00010673987 +6.354423e-05 +0.0003419572 +0.00018932321 +3.1185988e-05 +0.00031975837 +0.00031104262 +7.3926254e-05 +0.0011545917 +8.575014e-05 +0.00023361114 +0.0006610472 +0.0004883716 +0.0003722783 +9.297524e-05 +0.00012120991 +3.4105407e-05 +0.00024642906 +0.000107494736 +6.998423e-05 +1.7957382e-05 +1.0631384e-05 +0.00018812768 +9.721867e-05 +5.1466308e-05 +2.9841798e-05 +5.317565e-05 +4.5402485e-05 +7.383276e-05 +5.9323876e-05 +0.00011473314 +2.5858333e-05 +0.0002425595 +4.3574375e-05 +0.00016768574 +0.00012793462 +7.1418945e-05 +0.00023895786 +0.00017441496 +2.3925382e-05 +0.0007274894 +0.00054904143 +0.0006600553 +0.0003689452 +0.00019176958 +8.68306e-05 +0.00018872788 +9.3901745e-06 +0.0003732282 +9.679007e-05 +5.338826e-05 +8.710209e-05 +0.00010672185 +4.1709736e-05 +7.757896e-05 +1.37239085e-05 +3.7243954e-05 +0.00015834477 +0.0005567674 +0.00032743503 +0.0011654142 +0.00081817544 +0.00024791955 +0.00015350303 +4.055702e-05 +1.2827285e-05 +0.00036997424 +6.42643e-05 +0.00015970865 +0.00030701264 +0.0005480433 +3.475775e-05 +0.0002730317 +0.00013267291 +2.429988e-05 +0.0001434095 +8.784407e-05 +0.00047590246 +3.644311e-05 +0.00023676634 +0.0002182384 +0.000118374526 +0.00029589442 +3.6611822e-05 +1.2448694e-05 +9.5065865e-05 +0.00013185348 +5.2593718e-05 +0.00011015442 +1.475699e-05 +0.00014547075 +0.0006541775 \ No newline at end of file diff --git a/qa/L0_java_resnet/expected_output_data/expected_output_pytorch.txt b/qa/L0_java_resnet/expected_output_data/expected_output_pytorch.txt new file mode 100644 index 0000000000..2ae00703da --- /dev/null +++ b/qa/L0_java_resnet/expected_output_data/expected_output_pytorch.txt @@ -0,0 +1,1000 @@ +-0.30805874 +0.07984302 +-1.1900374 +-1.4836702 +-0.5135901 +0.36827153 +-2.1639166 +-0.8705013 +-1.8812447 +-0.16076666 +0.21684004 +-0.928281 +-1.2953714 +-1.0791287 +-1.444455 +-0.89458805 +-0.09590192 +-1.3098954 +-1.2062448 +-1.2327268 +-1.0658404 +0.9427469 +0.5738615 +-0.27459937 +-1.0188934 +-0.35831845 +-0.18257675 +0.27853626 +0.22089688 +-0.3340493 +-1.979969 +-0.555245 +-1.0804464 +-0.8055694 +-0.0004951467 +-1.8401799 +-0.79792225 +-1.4822828 +1.3656672 +-0.89703584 +-1.0853906 +-1.1591249 +-0.032266144 +0.19187923 +-0.4777367 +0.031621072 +-0.7464974 +-0.10246294 +-1.3072289 +-1.8479855 +-0.86044043 +0.8683053 +-0.13818197 +-0.5942293 +-1.0837044 +-1.5115174 +-1.4216323 +-1.7622145 +-1.3229938 +0.3092505 +-0.91198456 +-1.2568892 +-0.42140645 +0.7647873 +0.096434265 +-0.2201274 +0.20995392 +-1.071132 +0.14306861 +0.7973344 +-0.8894367 +1.6341836 +-0.98152703 +1.1916499 +-1.625073 +0.2928239 +-0.8159483 +-0.19991271 +1.6001159 +1.1522979 +0.5397157 +-0.21569327 +-0.5722878 +-0.2540483 +-1.3144569 +-1.3187109 +-0.6919892 +0.06748002 +-0.16136988 +-0.16745704 +-1.043228 +-0.07053011 +0.6526221 +-0.6888746 +-1.0834798 +-0.76091695 +-0.69209605 +-2.3364725 +0.20736966 +-0.21594861 +-0.5073983 +-0.18135151 +-0.85716504 +0.7947216 +-1.5203276 +-2.1758971 +-1.1328814 +-0.13168834 +-1.00645 +-1.0352936 +-0.7703913 +2.5937598 +-0.18291345 +0.037092943 +-0.8275598 +-1.6695257 +-0.007664892 +1.1827207 +-1.3609017 +-1.6130087 +-0.34498727 +-2.0094082 +-0.3217112 +-1.40436 +-1.0353576 +-0.5387643 +1.3731303 +0.17038514 +1.3134736 +-0.6706346 +-1.2812335 +-1.2500542 +-0.8758088 +-1.2494946 +-1.1121799 +-0.43794972 +-1.3552142 +-0.85109013 +-0.806748 +-1.3894855 +-0.93128216 +-0.5771268 +-0.8600849 +-0.6528389 +-0.96694344 +-0.2790189 +-0.13756554 +0.33111212 +-1.017053 +-0.06247963 +-0.82307434 +0.2321171 +0.5925774 +0.11956272 +-0.39129296 
+-0.96967256 +-0.34883505 +-0.32861945 +-0.17424661 +-0.5203654 +-0.05074156 +-0.5735833 +-0.89118445 +0.94264233 +-0.48076403 +0.23871332 +-0.5359333 +-0.17496297 +-0.1825326 +-0.8143634 +-0.25432184 +-0.8875172 +-0.40212584 +-0.4248538 +-1.0707774 +0.28054383 +-0.8788248 +-0.063131236 +-0.13580973 +-0.633922 +-1.0408156 +-0.2155596 +-0.868021 +0.02111919 +-0.8062073 +0.21586944 +-0.84782946 +0.36418468 +0.23975046 +0.07298894 +0.8168585 +-0.37726068 +-0.8602677 +-0.21154118 +-0.06361114 +0.39261663 +-1.0140715 +-1.0971476 +-0.94316417 +-0.12982899 +-0.7508501 +-1.874781 +-0.21622303 +-0.7669267 +-0.42140815 +-1.5047493 +-0.6215693 +-0.2612905 +-0.35666725 +-1.0537395 +-0.38551807 +-0.6064094 +-1.2473556 +-1.0768366 +-0.3829122 +-0.85829455 +0.25932565 +-0.9240785 +-1.4660195 +-1.1539187 +0.5768459 +-0.21287401 +-0.4301784 +-0.27853447 +-0.5630739 +-0.88488144 +-0.6149986 +-1.2260586 +-0.118166335 +-0.30751112 +-1.3458123 +-0.787824 +-0.4979396 +-0.07821896 +-0.47691333 +0.21768509 +-0.28501546 +-0.39360434 +-0.99358493 +-0.44038853 +-1.1004056 +-0.36356282 +-1.4787167 +-0.6785121 +-1.0707904 +-0.60454124 +0.0018921697 +-0.60659164 +-0.7804347 +-0.70279366 +-0.45327887 +-0.5740117 +0.12954347 +-1.0870117 +-0.071922086 +-1.5970279 +-0.12967396 +-0.41402286 +-0.34608856 +-0.45053896 +-0.050228007 +-0.036393084 +0.64593357 +-0.91866577 +-0.79366595 +-0.60279816 +-0.55361813 +-0.9942526 +-0.30023605 +-1.0588075 +-0.1602141 +-1.2761784 +-0.80111355 +-0.7847453 +-0.4366057 +-0.29868704 +-0.11143246 +0.24950753 +-1.0829991 +-0.235288 +-0.56935483 +0.8004865 +-0.15923998 +1.5074099 +0.15986127 +0.42949948 +-1.5360352 +-1.3022994 +-0.621235 +-1.2557826 +-1.6063809 +-0.39241713 +-0.8660014 +0.43634364 +-0.7142573 +-1.8392187 +-0.66524017 +-0.4094579 +-0.55560684 +0.26369932 +-0.2994155 +0.19446117 +0.00012531597 +-0.056575328 +-1.0310686 +-1.1073819 +0.95716393 +-0.039132416 +-0.17284413 +-1.7137713 +1.0318145 +-0.6407014 +-0.20157519 +-0.53714764 +1.3076634 +-0.21518743 +-0.10755904 +-0.6703936 +0.58359814 +0.1296847 +-0.74383837 +-2.052296 +-1.943493 +-1.2419901 +-1.5791146 +-1.7323232 +-0.4647262 +-0.8547239 +-0.5982981 +-1.3872371 +-0.8413639 +0.5059893 +-0.028888466 +-1.0159539 +-0.8781407 +-0.8586551 +-1.5765216 +-0.72110957 +-0.54951406 +-1.0456697 +-0.46384534 +-0.8682762 +-1.329279 +-1.5812683 +-1.1616806 +-0.5591132 +-0.68271846 +-0.6140093 +-0.8487391 +-0.8138591 +0.5194415 +0.8475472 +-1.2317592 +-0.06508279 +0.84332556 +-0.7534412 +-0.061359435 +-0.17108928 +0.029114015 +0.29252198 +-0.99659246 +0.18716425 +-0.48432857 +-0.574279 +-0.149806 +-0.526539 +-1.6839328 +0.298726 +-1.12589 +-1.2416302 +-1.0083416 +-0.1886835 +-1.2171522 +-0.11976431 +-0.2596951 +-1.1662437 +-0.019736286 +-0.4496138 +-0.12932746 +-1.9007655 +-1.2868488 +-1.1776145 +-0.70207584 +-0.99402976 +-1.5353495 +-0.08161428 +-0.7827241 +-0.9851597 +-1.7212214 +0.30599424 +-1.3255223 +-0.78677404 +0.020959575 +-1.938007 +-0.87134534 +-0.4159284 +-1.7782842 +-1.1730373 +0.08866749 +2.0492961 +1.0762362 +-0.007216261 +0.97626513 +-0.74596655 +0.4418997 +1.1642963 +-1.1256992 +-0.95673156 +-0.64594465 +1.9042864 +-2.266134 +0.23068756 +-0.2024498 +2.4514055 +0.17042859 +-2.1488516 +2.029247 +0.08730792 +2.4830267 +1.5408521 +0.22265166 +0.059148587 +0.47132158 +1.0918839 +-0.8906889 +-0.35221744 +1.1023836 +0.71585846 +-0.080300555 +0.7830516 +-0.1106352 +0.3845232 +1.8773831 +0.25839618 +0.01753266 +0.4585675 +-0.556912 +0.18327093 +1.1123434 +0.54181975 +0.2650874 +-0.6121097 +0.6899404 +0.5327037 +-0.4582712 +-0.3463424 
+2.0124485 +0.92011964 +-0.21810834 +-0.7299846 +-0.9932826 +2.0011783 +-0.9193375 +0.07095367 +-2.3654485 +-0.13455993 +1.2373506 +0.55914795 +-0.7927128 +0.24315542 +-0.6954934 +0.818556 +2.2227504 +0.50524724 +1.5352136 +2.1754854 +-1.2847167 +-1.7190467 +-1.2820773 +0.48650953 +0.9624859 +-0.28632626 +-1.7782934 +3.3267088 +-0.80292964 +-0.82254416 +-2.4419034 +1.1831589 +0.27238667 +-0.08926326 +0.114699185 +2.2780476 +1.2212758 +-1.5160606 +-0.7004898 +0.46838894 +0.61680245 +1.7088135 +1.709012 +-1.2258106 +0.67940307 +1.9137111 +-0.13307501 +0.8966815 +1.0661377 +-0.077985905 +0.294199 +-1.1051399 +-0.61139315 +3.6302567 +-0.82702637 +0.40620643 +0.898003 +2.2812579 +0.42015857 +0.41871074 +-0.5433154 +2.1934881 +0.44938952 +-2.4096403 +0.3080853 +-0.75909114 +2.749651 +1.273376 +0.88220817 +0.46447915 +0.84428304 +0.5331683 +0.41311303 +0.3472368 +0.42634374 +0.5020205 +1.133693 +0.6315067 +0.49277782 +-1.1333336 +0.5877674 +1.6507065 +0.6192476 +0.6534441 +1.9449492 +0.80630463 +0.57669324 +-0.67982227 +1.7395757 +-0.028182037 +-0.9472996 +-0.8416842 +-0.12939622 +1.2086351 +0.57445955 +0.7767944 +1.280486 +1.2262709 +-0.8028702 +1.0569873 +0.94939137 +-1.4751376 +-0.19903125 +2.3615687 +1.1166264 +2.325268 +0.8368003 +1.1348325 +-0.81748235 +-0.94805723 +1.3997422 +0.48129374 +0.87885517 +1.8402383 +-0.7471128 +0.063835524 +-1.1082904 +0.8763111 +-1.1521848 +-1.3750111 +-0.17355038 +2.084852 +0.0059059784 +-0.9651331 +1.4963127 +-1.2178527 +0.85985076 +-0.04743771 +1.2991531 +-1.2023815 +-0.538383 +1.2776058 +0.44704303 +-0.09368593 +-1.4124348 +-1.66763 +-1.382003 +2.56167 +2.7520278 +1.7802238 +0.20748135 +2.201629 +1.4195694 +1.0006833 +1.2050105 +0.7915465 +0.80263686 +1.54673 +0.29449403 +-0.18094113 +2.7645786 +0.08308226 +0.32472438 +0.41058362 +2.673242 +1.3079755 +0.78823054 +-1.2491844 +2.6995187 +0.3947289 +1.5972215 +-0.2016275 +0.667046 +-1.0026234 +1.5369157 +-0.21158755 +-0.5587798 +1.8455683 +0.18770997 +1.7668104 +1.3544986 +0.5668934 +1.6499695 +0.79549676 +0.23864032 +-0.076060526 +0.54530853 +3.0026731 +-1.3816507 +-0.9419994 +2.1659389 +-0.49469137 +-0.23300627 +2.2649322 +0.6988553 +1.7207134 +1.4296931 +1.8957422 +-1.7843419 +2.108782 +0.63150716 +-1.2306048 +0.4726084 +0.16148792 +-1.1888111 +2.5059545 +0.49573082 +1.0300703 +2.1389406 +-0.6599807 +-0.037568122 +0.94101214 +-0.2563992 +0.37840766 +2.115041 +0.7366525 +0.3634316 +0.93945736 +-0.4147591 +-0.38213915 +-1.2784125 +-0.08756078 +-0.9641913 +0.19105943 +-0.3143284 +-1.6625874 +1.6527823 +-0.5382227 +0.3207345 +-0.595412 +1.5850205 +0.8305495 +-0.8234362 +-0.8500601 +-0.7534717 +-0.9616986 +0.4730339 +1.5510118 +2.668524 +-0.60776836 +1.7700179 +2.7614388 +1.3252912 +0.59501547 +2.1923153 +1.6112024 +-0.40866897 +1.8549836 +2.2821114 +-0.77804285 +1.6713705 +-1.6944448 +0.17435041 +-0.2616872 +-1.3363857 +0.6129463 +0.86893713 +0.6393853 +-1.234884 +1.1132063 +2.0555096 +0.022984732 +-1.0277154 +2.4854038 +1.451681 +1.6226276 +0.67418146 +-0.85724473 +-0.7612631 +-1.2767704 +-1.0986053 +-0.21717405 +1.6196754 +0.6333269 +1.2900922 +1.2161998 +0.36294502 +1.5778857 +1.6918045 +0.99078727 +-0.45147473 +-1.2807459 +0.045685403 +1.0520277 +1.9152287 +-1.3029758 +0.9261474 +0.7156784 +-0.19225252 +0.55643463 +2.0766673 +-0.18557347 +0.13493066 +1.802568 +0.23648183 +2.766143 +0.2725357 +1.0387229 +-1.9429945 +0.23742795 +0.54052275 +0.2342531 +0.132205 +0.82999367 +1.7976496 +0.49230877 +0.7958189 +-0.37094918 +1.110652 +0.6413396 +1.1133307 +1.7305324 +0.37832874 +2.2200847 +-0.36919576 +-0.9609986 
+0.19756792 +1.3253196 +1.8076504 +0.103227235 +-0.42585406 +-1.348184 +1.8132821 +1.2306423 +1.1028852 +1.9165587 +-2.4476745 +2.054153 +1.682224 +0.44401717 +0.19734457 +1.5318341 +-0.47473955 +2.3914623 +0.42040017 +0.6056829 +2.4316716 +0.34631512 +1.3324567 +0.0011816069 +1.1105287 +1.4553503 +1.7634965 +-0.6814372 +0.2123078 +0.16176923 +1.0453559 +2.9997826 +2.2626696 +-0.76536435 +-0.42744967 +0.14685751 +-2.1144905 +0.90889215 +1.048776 +-0.1111255 +1.91633 +0.45815408 +0.054494135 +0.420825 +0.21111344 +1.0745884 +1.3172199 +-0.20259683 +-0.77705085 +-0.0074540502 +-0.3671591 +-0.33085522 +1.9708865 +-0.57260597 +0.46406755 +-0.46640325 +-0.46216512 +-0.59125966 +0.87914044 +0.7298775 +1.101785 +3.035671 +-0.35254276 +-0.86594146 +-0.80589545 +-0.7337217 +1.8224323 +-1.2016355 +-0.72215164 +-0.47425175 +0.3528979 +1.0273298 +-0.036939412 +0.2297522 +2.528665 +0.3788014 +1.9056299 +-1.8528597 +1.3645221 +1.9897952 +-0.32049844 +0.20599015 +1.1722815 +-0.74404633 +1.4928225 +0.8872909 +1.4359131 +-0.72126484 +1.1888711 +1.0988497 +0.34612125 +-1.1861738 +2.339421 +1.8755157 +2.8820977 +-0.5806484 +0.39929 +-0.2774235 +-0.27243808 +1.1287675 +1.7444426 +-0.59589016 +1.1558293 +1.2643657 +-0.024029814 +0.23252903 +-0.2631906 +0.82813776 +-0.01724714 +1.3382394 +0.8137164 +0.0848312 +-0.667315 +-2.0700092 +-0.4838388 +-0.51320595 +0.0037372224 +1.3113365 +-0.22582024 +-0.48156402 +1.6307961 +0.09801248 +-2.1774163 +0.64898616 +0.19490883 +0.1979113 +0.5982482 +0.08691002 +0.46526366 +0.80410117 +0.7230205 +1.8608608 +-0.79288054 +1.1912636 +-0.38980532 +-0.44946012 +-0.18038842 +0.37972292 +-1.0056939 +1.2174432 +2.4348667 +0.66281396 +1.1692165 +0.5451535 +1.162487 +1.303657 +1.1611288 +1.4010147 +0.04817031 +1.7428269 +3.0368202 +0.8766508 +-0.26485524 +0.26849088 +1.869811 +0.48758182 +-0.6030314 +0.14393385 +0.57609755 +0.5643001 +-1.2467934 +0.17159785 +-0.56257993 +-1.4873617 +2.5040245 +-0.57016486 +-0.56566435 +0.13093448 +0.35735604 +0.5589198 +-0.28002298 +-0.20552874 +-1.1545538 +-0.12005596 +-0.63608867 +-0.5422438 +-1.5786606 +-0.08732763 +0.26583073 +-0.48822308 +-0.61887413 +-2.0053678 +-0.8047017 +-0.78162575 +-0.06668275 +0.49894157 +0.15497255 +-0.7863977 +0.6278491 +-0.9034021 +0.19300902 +0.026619527 +-0.3625757 +0.51064104 +-0.40118733 +-1.2872294 +1.4680091 +1.5331635 +-0.0104825385 +0.7074813 +0.47988775 +-0.15154226 +0.9793232 +-0.8414473 +0.6749984 +0.3124825 +0.027812386 +-0.59152645 +-0.05568023 +-0.7404828 +-0.5500867 +-1.7206669 +-0.7042971 +-1.0925202 +1.581233 +-0.121507704 +0.8914928 +0.9794418 +-1.1422362 +-0.12346666 +-0.5999273 +-2.1338222 +-0.077511735 +-0.8373626 +-0.23501818 +-0.010404997 +-0.041594535 +-1.0295677 +-0.29143637 +-0.22416036 +-0.8062624 +-0.7818173 +-0.2714035 +0.00018124096 +-1.2354704 +0.123760514 +0.018292539 +-0.6903522 +0.52160364 +-1.8007841 +-1.782615 +-1.2970004 +-1.6565065 +-1.3305808 +-0.6563534 +-1.6530751 +0.117775925 +0.24357137 \ No newline at end of file diff --git a/qa/L0_java_resnet/expected_output_data/expected_output_tensorflow.txt b/qa/L0_java_resnet/expected_output_data/expected_output_tensorflow.txt new file mode 100644 index 0000000000..d017d7d60b --- /dev/null +++ b/qa/L0_java_resnet/expected_output_data/expected_output_tensorflow.txt @@ -0,0 +1,1001 @@ +0.00070911006 +0.0010684511 +0.0002289149 +0.0002890797 +0.001823506 +0.00033588437 +0.0005761559 +0.00026887475 +0.00016327911 +0.00062107155 +0.00035215134 +0.00021309333 +0.0002824714 +0.00032690517 +0.000362966 +0.00029754156 +0.000462734 +0.0009069857 
+0.00024187386 +0.00022825644 +0.0005646942 +0.0005685028 +0.0015051479 +0.000550871 +0.00035833745 +0.0007460652 +0.00018980923 +0.0006296634 +0.0009744452 +0.0004044121 +0.00021716364 +0.003566736 +0.00033353135 +0.00038591775 +0.0012752721 +0.00010569831 +0.0002329158 +7.213156e-05 +0.0042858184 +0.0008237876 +0.0010394026 +0.00012532603 +0.00022559383 +0.00018184909 +0.00024319398 +0.0005497621 +0.0010193866 +0.0012020781 +0.0002604365 +0.00036887883 +0.00039009948 +0.0005622609 +0.0005074424 +0.00065419363 +0.0001678674 +0.0007651498 +0.00019579448 +0.000100849866 +0.00060587144 +0.009335775 +0.002238217 +0.00042261003 +0.0004869275 +0.0017416928 +0.00050716975 +0.0003331386 +0.0009492363 +0.00026299703 +0.00096314494 +0.0002454126 +0.00052854954 +0.0022881972 +9.451885e-05 +0.0020056511 +0.00017017504 +0.00013614705 +0.00031952796 +0.0006581821 +0.00086781656 +0.0010920991 +0.00016639908 +0.00029970525 +0.00036486977 +8.347438e-05 +0.00027294483 +0.00027506787 +0.00014957263 +0.00012473388 +0.00047103016 +0.00068512204 +0.00026231256 +0.0002471854 +0.00038985602 +0.0005510145 +0.0015379117 +6.391459e-05 +0.00075941073 +0.00021282899 +0.00016255074 +0.0006057964 +0.00034061813 +0.000116008356 +0.00013254896 +0.00072937884 +0.00025322058 +0.00013424494 +0.000116978124 +0.0012361282 +0.0008600386 +0.00016587735 +0.0008544744 +0.048096422 +0.0014968353 +0.0025525852 +0.0003895892 +0.0004779505 +0.0010679476 +0.001583124 +6.0117403e-05 +0.00023506799 +0.00080862094 +9.170748e-05 +0.0003459704 +0.00025960154 +0.00032231968 +0.00024193742 +0.0005336417 +0.0003331181 +0.00083348254 +0.0005098401 +0.00050219166 +0.0001382532 +0.0013238905 +0.00024549733 +0.00018288675 +0.00032616325 +0.00016282972 +0.00012510039 +0.00037040826 +0.00023140096 +0.00033143876 +0.00035791306 +0.00014701433 +0.00016613651 +0.0007882612 +0.00020208673 +0.00025587558 +0.0005947067 +0.00020411932 +0.0003501464 +0.00019414612 +0.00036807868 +0.00016704168 +0.00031899076 +0.00014016406 +0.0001590781 +0.0001042989 +6.693639e-05 +0.00044032355 +0.00019823047 +0.0001648452 +0.00021023075 +0.000121602214 +0.0008859733 +0.00027336556 +0.00021329267 +0.00042354263 +0.00015121586 +0.000366059 +0.00013732535 +0.00029352968 +0.00021702389 +0.00028322692 +0.00041577345 +0.00022989941 +0.00022801253 +0.00016557571 +0.00020442168 +0.00084447116 +0.00024891275 +0.0002122566 +0.00030452234 +7.565878e-05 +0.00012686373 +0.00019746723 +0.00032517608 +0.00019016817 +0.00029626995 +0.00016989792 +0.00049037643 +0.00020838893 +0.00019873244 +9.189098e-05 +0.0006875357 +0.00064732507 +0.00034183732 +0.00015014365 +0.00011403188 +0.000537032 +0.0003341667 +0.00029259248 +0.00038738886 +0.00012182328 +0.00051590457 +0.00033943634 +9.197326e-05 +0.00039432684 +0.00014883016 +0.00045966395 +0.00023865228 +7.6960285e-05 +0.00014399357 +0.0003608486 +0.00025755627 +0.00020178013 +0.0003600289 +0.0011284449 +0.0001712409 +0.00019862416 +0.00025335004 +0.0001756047 +0.00034503645 +0.00039285867 +0.00017203313 +0.0012871717 +0.00030436684 +0.00024817986 +0.0010085882 +0.00027581956 +0.00028622823 +0.0002573273 +0.00038505017 +0.00039457608 +0.0002494052 +0.00018972508 +0.0003315194 +0.00022963689 +7.301075e-05 +0.00023747115 +0.00032635694 +0.00021661345 +0.00034653372 +0.00018944537 +0.0002243273 +0.0003466119 +7.474429e-05 +0.00029931893 +0.00026417332 +0.000116994954 +0.0002012358 +0.0004963594 +0.00027601913 +0.00023313782 +0.00021496546 +0.00033204685 +0.00038143748 +0.00010215905 +0.00022710346 +0.0004710895 +0.00010912214 +0.00067364104 
+0.0002553266 +0.00024328758 +0.00018621673 +0.00024005111 +3.6619393e-05 +0.00031510097 +0.00025127587 +0.00020713067 +0.00053867674 +0.0004486591 +0.00012326887 +0.00013776327 +0.00010066613 +0.0001907201 +0.00019176993 +0.00028617049 +0.00043150192 +0.00022882965 +0.00017046132 +0.0001404705 +0.0003074807 +0.00069475063 +0.0005420082 +0.00016548761 +0.0011550415 +0.0003579725 +0.00013039725 +0.00046354206 +0.00025531164 +0.00015127688 +0.0003076982 +9.368715e-05 +0.000253574 +0.0004157336 +0.00025558594 +0.00020862755 +0.0003325044 +0.00010430214 +0.0005750662 +0.00034912725 +0.0003502339 +0.00013765084 +0.0011814896 +0.0007353515 +0.0004288803 +0.0010895525 +0.0021925315 +0.0010849636 +0.0002088477 +0.000698407 +0.0005413023 +0.0025422976 +0.00050733547 +0.00056180026 +0.0032103728 +0.00023816712 +0.0017631998 +0.003166806 +0.00075065246 +0.00043124682 +0.00020120693 +0.00030978755 +0.00040472345 +0.0010322309 +0.0002756523 +0.0007263063 +0.00038796544 +0.0014804546 +0.00025164674 +0.00021415394 +0.00015569745 +0.00047274903 +0.00026750995 +9.396422e-05 +0.0003232726 +0.0003681733 +0.00017011825 +0.00037481345 +0.000110637375 +0.00027844915 +0.00027941877 +0.00028294954 +0.000107615866 +0.00013299155 +0.00025102712 +0.0003521134 +0.00018762982 +0.0005306597 +0.00027527596 +0.0001893789 +0.0006203038 +0.0002596028 +8.3349165e-05 +0.000421517 +0.00033665064 +0.00045308896 +0.000110814566 +0.00016861226 +0.0006383047 +0.00020831541 +0.00014839825 +0.00029492003 +0.00019427085 +0.00045692816 +0.00020844795 +0.00019500752 +0.00040315292 +8.695124e-05 +0.00013987756 +0.00012228725 +0.00056897226 +0.00020290921 +0.0002687522 +0.00023272065 +0.00015077695 +0.0004568092 +0.00052215316 +0.00027182538 +0.00020620856 +0.0010283174 +8.266399e-05 +0.00021341672 +0.00019470627 +0.0004475956 +0.00043766637 +0.00018623582 +0.00022168642 +0.00027278156 +0.00027336203 +0.00034579786 +9.910105e-05 +0.00036059332 +0.0005613833 +0.00021642471 +0.00061176467 +0.00032723378 +0.0007215444 +0.00042581535 +0.006056687 +0.00015225813 +0.0038606655 +0.0033682694 +0.0005007813 +0.00034089078 +0.001088126 +0.0003091816 +0.00025670388 +0.00028364526 +0.0039907284 +8.005619e-05 +0.0003177985 +0.00044217892 +0.003775855 +0.00022793307 +9.9455334e-05 +0.0042361487 +0.0015110963 +0.0014649354 +0.00076693745 +0.00014660056 +0.0008259513 +0.00014898773 +0.00094022823 +0.0018079237 +0.00027478446 +0.0008579107 +0.00027007243 +0.00027866405 +0.0021426745 +0.00030444653 +0.00013589527 +0.0025529363 +0.00022925495 +0.00020205135 +0.0006399196 +0.0001175159 +0.0008898152 +0.0007308672 +0.00015426724 +0.00070449885 +0.00063714065 +0.0011764771 +0.000113688315 +0.00025997663 +0.0002751466 +0.0012629845 +0.00061876763 +0.00047713597 +0.00018022317 +0.000112102745 +0.0019180076 +0.00014341537 +0.00038212672 +0.00023863306 +0.00014654716 +0.0009910533 +0.00046345408 +0.0006146838 +0.0022888945 +0.00014176867 +0.0009656023 +0.0007254071 +0.0003110353 +0.00075938756 +0.0017488213 +0.00026165575 +0.00043671884 +0.00025007708 +0.0010205496 +0.0072930735 +0.00079188804 +0.00014444374 +0.0022240074 +9.0894464e-05 +0.0005548176 +0.00036375815 +0.00045969928 +0.00049831875 +0.0006171517 +0.0005445464 +0.0005370632 +0.0009902638 +0.0005372154 +0.00047807358 +0.0018499892 +0.00092412543 +0.006397552 +0.0046642385 +0.00015648386 +0.0003896425 +0.0050082384 +0.0003178785 +0.00040727912 +0.0012690715 +0.00029073926 +0.00041457833 +0.0022713607 +0.0026651558 +0.0043892586 +0.0002917294 +0.0015015705 +0.0002936945 +0.0011139546 +0.0022272936 
+0.0006511537 +0.0008047797 +0.0006209673 +0.0012966822 +0.000117934265 +0.0003287383 +0.00011685335 +0.00408869 +0.0020391766 +0.0005868179 +0.00081892085 +0.0008156648 +0.0029200844 +0.0005166022 +0.0005672602 +0.0001692095 +0.0003508818 +0.00013667026 +0.0019258707 +0.0002646609 +0.000203857 +0.0004557036 +0.0014699163 +0.00075061055 +0.00027520838 +0.024521487 +0.0023796572 +0.00031702282 +0.00016261516 +0.0030187324 +0.0001344725 +0.00052194105 +0.00040833795 +0.00073826866 +0.0013697975 +0.00053330604 +0.00047440815 +0.002975715 +0.0034163008 +0.00039923526 +0.0003814433 +0.0033519045 +0.00018409718 +0.00015521496 +0.0012500272 +0.000352364 +0.0026179687 +0.0009001603 +0.0007409122 +0.00020439974 +0.0001491525 +0.00057623035 +0.00053739885 +0.001619859 +0.0007606476 +0.00043201642 +0.00048651415 +0.001913164 +0.001860702 +7.184188e-05 +0.0003567602 +0.00047219777 +0.0013026253 +0.0005371198 +0.0003526595 +0.0010887473 +0.00021295802 +0.0015989357 +0.00016607663 +0.002568109 +0.0002009078 +0.00010516546 +0.00071283046 +0.0003078614 +0.0021156948 +0.00077290024 +0.00027787217 +0.00018751186 +0.0016615124 +0.0015545615 +0.0010933804 +0.0001293072 +0.0012888343 +0.00020816487 +0.00030583332 +0.00016422627 +0.00015186946 +0.00031760518 +0.003668799 +0.0008204296 +0.0006058452 +0.0075512384 +0.0006543231 +0.0003984883 +0.0004991135 +0.0063148434 +0.0004667902 +0.0019243147 +0.00026864174 +0.004626689 +0.0016829795 +0.0024464321 +0.002604262 +0.0005715485 +0.0004827969 +0.00059977506 +0.00044812242 +0.00018801834 +0.0014922172 +0.00039306682 +0.00038797187 +0.0017823273 +7.7641904e-05 +0.0013096565 +0.0033977008 +0.0014362672 +0.0010601832 +0.0016821629 +0.0041754427 +0.00036547767 +0.00034212973 +0.04761514 +0.00039928395 +0.0007339863 +0.0048003797 +0.00032377243 +0.0006853962 +0.0019343331 +0.0021214003 +8.4536754e-05 +0.0014679983 +9.906235e-05 +0.0001737739 +0.00044015897 +5.232733e-05 +0.0003227811 +0.0011037658 +0.0009596574 +0.002163132 +0.034116793 +0.00018434737 +0.00054400944 +0.00027010517 +0.00029613025 +0.0002854188 +0.008274664 +0.0026966897 +0.00056778896 +0.00056742143 +0.0001424069 +0.00021398348 +0.0002040955 +0.0007528397 +0.00047215613 +0.0003180315 +0.00026779302 +0.00017190988 +0.00057392224 +0.00026870312 +0.0041729347 +0.00022995795 +0.00473034 +0.00053698535 +0.0015700939 +8.663364e-05 +0.00037708133 +0.00010627266 +0.0008188108 +0.0013689178 +0.0028652248 +0.00030012682 +0.00019088034 +0.0020974467 +0.0005804101 +0.0046054157 +0.000866855 +0.0028432102 +0.0004053386 +0.0022837527 +0.00031697293 +0.0020557377 +0.0006195521 +0.00029529422 +0.0019667863 +0.00028010362 +0.00036917007 +0.0014461187 +0.0010241494 +0.00035407842 +0.0007762103 +0.0007345563 +0.0016735821 +0.000100398545 +0.00042761158 +0.0018091354 +0.0011984855 +0.00054059736 +0.0010517223 +0.0003952099 +0.0004072673 +5.0896953e-05 +0.00015406184 +0.0011205417 +0.0016784162 +6.48444e-05 +0.001374897 +0.0049680024 +0.00031736813 +0.00040638892 +0.0031774077 +0.00014365144 +0.00058315735 +9.539311e-05 +0.0002490495 +0.00080948864 +0.0026334277 +5.1187024e-05 +0.0019501996 +0.00017581039 +0.0007018262 +0.00082990975 +0.00033347218 +0.0003785377 +0.00024977518 +0.0006290335 +0.0005053414 +0.001499565 +0.0002951073 +0.00053611986 +0.00018856855 +0.00011126017 +0.0019289504 +0.0006362068 +0.000522457 +0.00032152023 +0.0018640001 +0.0008822051 +0.0009148322 +0.0009896222 +0.0029765042 +0.0014977105 +0.0003173049 +0.0015661103 +0.00010378374 +0.0067265066 +0.0005495047 +0.00020958934 +0.00019278725 +0.0009433383 
+0.0026177543 +0.00051816285 +0.00017156888 +7.744175e-05 +0.0003151731 +0.0008290297 +0.0032181763 +0.0024396458 +0.00025281956 +0.0029372664 +0.0014309491 +0.00055660465 +0.0007385025 +0.0009333291 +0.0002543238 +0.0060301092 +0.00057014904 +0.0013402926 +0.0027256922 +0.0009102879 +0.0001869125 +8.260008e-05 +0.0039338632 +0.0023134286 +0.0012300106 +0.00029246748 +0.000283189 +0.00026828857 +0.0025049848 +0.0016384326 +0.0022900025 +0.0002599975 +0.0004017206 +0.0016243177 +0.0006216647 +0.0036319585 +0.00028053645 +0.0004719083 +0.00096298783 +0.00025558157 +0.00021045441 +0.00043856484 +0.00095168 +0.0002192634 +0.00033050985 +0.00012919637 +0.00022991112 +0.00042593313 +0.00029524197 +0.0003437868 +0.0051064813 +0.0005583069 +0.0007269702 +0.00024129995 +0.00030284372 +0.00027721547 +0.0003213354 +0.0006788763 +0.0012024492 +0.0036741009 +0.00024671428 +0.0005882029 +5.3842294e-05 +0.00040663296 +0.02228713 +0.0016194598 +0.00015659895 +0.00037711856 +0.00040618918 +0.0011397398 +0.00011992812 +0.0001520243 +0.0048938077 +0.0016474533 +0.0019597847 +0.00048948237 +0.00030241054 +0.00049067725 +0.00073232397 +0.00032315947 +0.0014954191 +0.00037097387 +0.0013783753 +0.0016116645 +0.00029578464 +0.00090505433 +0.00027435934 +0.0005812986 +0.000120840086 +0.00039883642 +0.0015213061 +0.0027571726 +0.0023957484 +0.00019108997 +0.0007307167 +0.000956605 +0.0006839414 +0.0024526927 +0.007934612 +0.00020379393 +0.015423247 +0.001909548 +0.000276556 +0.00094950746 +0.00063008594 +0.0019207522 +0.00024915 +0.00062654825 +0.0019300466 +0.00035208502 +0.00028049122 +0.0003148443 +0.00038308708 +0.00027527692 +0.00026734636 +0.000109911365 +0.00015939883 +0.00020454325 +0.0014520382 +0.0005228617 +0.00011064936 +0.003540477 +0.00031232936 +0.00044735873 +0.00017807365 +0.0013564116 +0.000965749 +0.0010829738 +0.00073439174 +0.0027080632 +0.00030311418 +0.00044519626 +0.0007992933 +0.00032909622 +0.00030226275 +0.0029641816 +0.00011622985 +0.0007482988 +0.001229003 +0.0025723213 +0.00065770274 +0.00015693594 +0.00054296193 +0.0013329909 +0.002655394 +0.00034390666 +0.00031026872 +0.0020210485 +0.0008697185 +0.00032176377 +0.0041055335 +0.0057543945 +0.00040670217 +0.0005435844 +0.009029863 +0.00028603026 +0.00064405525 +9.242199e-05 +6.4520485e-05 +0.00018704256 +0.00015222837 +0.00019523445 +0.005567865 +6.787147e-05 +0.00034305613 +0.0028331447 +0.0020781667 +0.00010261523 +0.0002362934 +0.00013399884 +0.00022745578 +0.00025935622 +0.00031119035 +0.00038356654 +0.00022390902 +0.00047898493 +0.0004629675 +0.000112182315 +0.00013342654 +0.00018693593 +0.00046389582 +0.00042846476 +0.00045707394 +0.00045862008 +0.00034546596 +8.175569e-05 +0.00023262479 +0.00021009706 +0.00047855324 +0.00030753214 +0.00019426928 +0.0010725219 +0.0003141107 +0.0005669363 +0.0012055356 +0.001431565 +0.0007926821 +0.0008843769 +0.0005278664 +0.00042725797 +0.003944173 +0.00015261356 +0.000299945 +0.00079040887 +0.00060629344 +0.00020051922 +0.00031456698 +0.00040859287 +0.00027128076 +0.00021296159 +8.693237e-05 +0.00027029635 +0.00305675 +0.0023890452 +0.003111028 +0.0006668401 +0.0004029482 +0.0032200122 +0.00013293372 +0.0007656965 +0.00023606456 +0.0003478867 +0.00031042635 +0.00016308061 +0.00038783776 +0.00043370973 +0.00089249195 +4.2713556e-05 +0.0004966322 +0.0016314207 +0.0004260099 +0.0017055604 +0.00043873576 +0.0004356743 +0.00071425876 +0.00013353773 +0.00031172932 +0.00033197878 +0.00043404778 +0.00013681914 +0.00016265325 +0.000201886 +0.000113467126 +0.000118104785 +0.0006379289 +0.0009817044 
+0.00019666742 \ No newline at end of file diff --git a/qa/L0_java_resnet/test.sh b/qa/L0_java_resnet/test.sh new file mode 100755 index 0000000000..1ca08b4c65 --- /dev/null +++ b/qa/L0_java_resnet/test.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Models +DATADIR=/data/inferenceserver/${REPO_VERSION} +MODEL_REPO=`pwd`/models +JAVACPP_BRANCH=${JAVACPP_BRANCH:="https://github.com/bytedeco/javacpp-presets.git"} +JAVACPP_BRANCH_TAG=${JAVACPP_BRANCH_TAG:="master"} + +# Create local model repository +mkdir -p ${MODEL_REPO} +# TODO: fix build to support GPU only resnet50v1.5_fp16_savedmodel +for BACKEND in _fp32_libtorch _fp32_onnx; do + cp -r $DATADIR/perf_model_store/resnet50${BACKEND} ${MODEL_REPO}/ + echo ${MODEL_REPO}/resnet50${BACKEND}/config.pbtxt + sed -i "s/kind: KIND_GPU/kind: KIND_CPU/" ${MODEL_REPO}/resnet50${BACKEND}/config.pbtxt +done + +# Set up test files based on installation instructions +# https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/README.md +set -e +git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git +source client/src/java-api-bindings/scripts/install_dependencies_and_build.sh -b $PWD --javacpp-branch ${JAVACPP_BRANCH} --javacpp-tag ${JAVACPP_BRANCH_TAG} --keep-build-dependencies +cd .. 
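+# The steps below build the JavaCPP tritonserver sample project, swap the
+# Simple sample for ResnetTest.java, and run it against the CPU-only ResNet50
+# models staged in the local model repository above.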
+ +CLIENT_LOG="client.log" +SAMPLES_REPO=`pwd`/javacpp-presets/tritonserver/samples/simple +BASE_COMMAND="mvn clean compile -f $SAMPLES_REPO exec:java -Djavacpp.platform=linux-x86_64" +source ../common/util.sh + +cp ResnetTest.java $SAMPLES_REPO +sed -i 's/Simple/ResnetTest/g' $SAMPLES_REPO/pom.xml + +rm -f *.log +RET=0 + +# Run with default settings +$BASE_COMMAND -Dexec.args="-r $MODEL_REPO" >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +# TODO: fix build to support GPU only resnet so can test TF as well +for BACKEND in ONNX TORCH; do + if [ `grep -c "${BACKEND} test PASSED" ${CLIENT_LOG}` != "1" ]; then + echo -e "\n***\n*** ${BACKEND} backend test FAILED. Expected '${BACKEND} test PASSED'\n***" + RET=1 + fi +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_java_sequence_batcher/SequenceTest.java b/qa/L0_java_sequence_batcher/SequenceTest.java new file mode 100644 index 0000000000..cfce3584de --- /dev/null +++ b/qa/L0_java_sequence_batcher/SequenceTest.java @@ -0,0 +1,665 @@ +// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
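+// SequenceTest exercises the Triton in-process Java API against a sequence
+// batcher model: it sends a series of INT32 requests that share a single
+// correlation ID, marks the first and last requests with the sequence start
+// and end flags, and checks that the final accumulated output matches the
+// expected sum of the inputs.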
+ +import static org.bytedeco.tritonserver.global.tritonserver.*; + +import com.google.gson.*; +import java.io.*; +import java.util.*; +import java.util.concurrent.*; +import org.bytedeco.javacpp.*; +import org.bytedeco.tritonserver.tritonserver.*; + +public class SequenceTest { + // Boilerplate code for setting up Triton + static void FAIL(String MSG) + { + System.err.println("Failure: " + MSG); + System.exit(1); + } + + static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) + { + if (err__ != null) { + System.err.println( + "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - " + + TRITONSERVER_ErrorMessage(err__)); + TRITONSERVER_ErrorDelete(err__); + System.exit(1); + } + } + + static int requested_memory_type = TRITONSERVER_MEMORY_CPU; + + static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server { + public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) + { + super(p); + deallocator(new DeleteDeallocator(this)); + } + protected static class DeleteDeallocator + extends TRITONSERVER_Server implements Deallocator { + DeleteDeallocator(Pointer p) { super(p); } + @Override public void deallocate() { TRITONSERVER_ServerDelete(this); } + } + } + + static void Usage(String msg) + { + if (msg != null) { + System.err.println(msg); + } + + System.err.println( + "Usage: java " + SequenceTest.class.getSimpleName() + " [options]"); + System.err.println("\t-m [model name]"); + System.err.println("\t-v Enable verbose logging"); + System.err.println("\t-r [model repository absolute path]"); + + System.exit(1); + } + + static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t { + @Override + public TRITONSERVER_Error call( + TRITONSERVER_ResponseAllocator allocator, String tensor_name, + long byte_size, int preferred_memory_type, + long preferred_memory_type_id, Pointer userp, PointerPointer buffer, + PointerPointer buffer_userp, IntPointer actual_memory_type, + LongPointer actual_memory_type_id) + { + // Initially attempt to make the actual memory type and id that we + // allocate be the same as preferred memory type + actual_memory_type.put(0, preferred_memory_type); + actual_memory_type_id.put(0, preferred_memory_type_id); + + // If 'byte_size' is zero just return 'buffer' == nullptr, we don't + // need to do any other book-keeping. + if (byte_size == 0) { + buffer.put(0, null); + buffer_userp.put(0, null); + System.out.println( + "allocated " + byte_size + " bytes for result tensor " + + tensor_name); + } else { + Pointer allocated_ptr = new Pointer(); + actual_memory_type.put(0, requested_memory_type); + + actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU); + allocated_ptr = Pointer.malloc(byte_size); + + // Pass the tensor name with buffer_userp so we can show it when + // releasing the buffer. 
+ if (!allocated_ptr.isNull()) { + buffer.put(0, allocated_ptr); + buffer_userp.put(0, new BytePointer(tensor_name)); + System.out.println( + "allocated " + byte_size + " bytes in " + + TRITONSERVER_MemoryTypeString(actual_memory_type.get()) + + " for result tensor " + tensor_name); + } + } + + return null; // Success + } + } + + static class ResponseRelease + extends TRITONSERVER_ResponseAllocatorReleaseFn_t { + @Override + public TRITONSERVER_Error call( + TRITONSERVER_ResponseAllocator allocator, Pointer buffer, + Pointer buffer_userp, long byte_size, int memory_type, + long memory_type_id) + { + BytePointer name = null; + if (buffer_userp != null) { + name = new BytePointer(buffer_userp); + } else { + name = new BytePointer(""); + } + + System.out.println( + "Releasing buffer " + buffer + " of size " + byte_size + " in " + + TRITONSERVER_MemoryTypeString(memory_type) + " for result '" + + name.getString() + "'"); + Pointer.free(buffer); + name.deallocate(); + + return null; // Success + } + } + + static class InferRequestComplete + extends TRITONSERVER_InferenceRequestReleaseFn_t { + @Override + public void call( + TRITONSERVER_InferenceRequest request, int flags, Pointer userp) + { + // We reuse the request so we don't delete it here. + } + } + + static class InferResponseComplete + extends TRITONSERVER_InferenceResponseCompleteFn_t { + @Override + public void call( + TRITONSERVER_InferenceResponse response, int flags, Pointer userp) + { + if (response != null) { + // Send 'response' to the future. + futures.get(userp).complete(response); + } + } + } + + static ConcurrentHashMap< + Pointer, CompletableFuture> futures = + new ConcurrentHashMap<>(); + static ResponseAlloc responseAlloc = new ResponseAlloc(); + static ResponseRelease responseRelease = new ResponseRelease(); + static InferRequestComplete inferRequestComplete = new InferRequestComplete(); + static InferResponseComplete inferResponseComplete = + new InferResponseComplete(); + + static TRITONSERVER_Error ParseModelMetadata( + JsonObject model_metadata, boolean[] is_torch_model) + { + String seen_data_type = null; + for (JsonElement input_element : + model_metadata.get("inputs").getAsJsonArray()) { + JsonObject input = input_element.getAsJsonObject(); + if (!input.get("datatype").getAsString().equals("INT32")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "sequence qa example only supports model with data type INT32"); + } + if (seen_data_type == null) { + seen_data_type = input.get("datatype").getAsString(); + } else if (!seen_data_type.equals(input.get("datatype").getAsString())) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "the inputs and outputs of sequence model must have the data type"); + } + } + for (JsonElement output_element : + model_metadata.get("outputs").getAsJsonArray()) { + JsonObject output = output_element.getAsJsonObject(); + if (!output.get("datatype").getAsString().equals("INT32")) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + "sequence qa example only supports model with data type INT32"); + } else if (!seen_data_type.equals(output.get("datatype").getAsString())) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + "the inputs and outputs of sequence' model must have the data type"); + } + } + + is_torch_model[0] = + model_metadata.get("platform").getAsString().equals("pytorch_libtorch"); + return null; + } + + // Custom function to set metadata required for sequence batcher + static void SetSequenceMetadata( + 
TRITONSERVER_InferenceRequest irequest, long correlation_id, + boolean sequence_start, boolean sequence_end) + { + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetCorrelationId(irequest, correlation_id), + "Unable to set correlation ID"); + int flags = 0; + if (sequence_start) { + flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_START; + } + if (sequence_end) { + flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_END; + } + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetFlags(irequest, flags), + "Unable to set flags"); + } + + // Custom function for adjusting sequence batcher + // expected results for backends that do not implement + // full accumulator + static int GetExpectedResult( + String model_name, int expected_result, int value, String flag) + { + if ((!model_name.contains("nobatch") && !model_name.contains("custom")) + || model_name.contains("graphdef") || model_name.contains("plan") + || model_name.contains("onnx") || model_name.contains("libtorch")) { + expected_result = value; + if (flag != null && flag.contains("start")) { + expected_result++; + } + } + return expected_result; + } + + // Standard function for checking response parameters, + // plus customized check that final sequence result + // "out" matches expected result + static void Check( + String model_name, TRITONSERVER_InferenceResponse response, + int input_value, String output0, long expected_byte_size, + int expected_datatype, boolean sequence_end, int expected_result) + { + HashMap output_data = new HashMap<>(); + + int[] output_count = {0}; + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseOutputCount(response, output_count), + "getting number of response outputs"); + if (output_count[0] != 1) { + FAIL("expecting 1 response outputs, got " + output_count[0]); + } + + for (int idx = 0; idx < output_count[0]; ++idx) { + BytePointer cname = new BytePointer((Pointer) null); + IntPointer datatype = new IntPointer(1); + LongPointer shape = new LongPointer((Pointer) null); + LongPointer dim_count = new LongPointer(1); + Pointer base = new Pointer(); + SizeTPointer byte_size = new SizeTPointer(1); + IntPointer memory_type = new IntPointer(1); + LongPointer memory_type_id = new LongPointer(1); + Pointer userp = new Pointer(); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseOutput( + response, idx, cname, datatype, shape, dim_count, base, byte_size, + memory_type, memory_type_id, userp), + "getting output info"); + + if (cname.isNull()) { + FAIL("unable to get output name"); + } + + String name = cname.getString(); + if (!name.equals(output0)) { + FAIL("unexpected output '" + name + "'"); + } + + if ((dim_count.get() != 1) || (shape.get(0) != 1)) { + FAIL("unexpected shape for '" + name + "'"); + } + + if (datatype.get() != expected_datatype) { + FAIL( + "unexpected datatype '" + + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name + + "'"); + } + + if (byte_size.get() != expected_byte_size) { + FAIL( + "unexpected byte-size, expected " + expected_byte_size + ", got " + + byte_size.get() + " for " + name); + } + + if (memory_type.get() != requested_memory_type) { + FAIL( + "unexpected memory type, expected to be allocated in " + + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got " + + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + + memory_type_id.get() + " for " + name); + } + + // We make a copy of the data here... which we could avoid for + // performance reasons but ok for this sequence example. 
+ BytePointer odata = new BytePointer(byte_size.get()); + output_data.put(name, odata); + System.out.println(name + " is stored in system memory"); + odata.put(base.limit(byte_size.get())); + } + + int out = new IntPointer(output_data.get(output0)).get(0); + System.out.println("Value: " + out); + if (sequence_end) { + expected_result = + GetExpectedResult(model_name, expected_result, input_value, "end"); + if (out != expected_result) { + FAIL("Expected result: " + expected_result + ", got " + out); + } else { + System.out.println(model_name + " test PASSED"); + } + } + } + + // Boilerplate main function to run inference + // for provided model, custom setting of + // sequence metadata + public static void main(String[] args) throws Exception + { + String model_repository_path = null; + String model_name = null; + int verbose_level = 0; + + // Parse commandline... + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "-m": + model_name = args[++i]; + break; + case "-r": + model_repository_path = args[++i]; + break; + case "-v": + verbose_level = 1; + break; + case "-?": + Usage(null); + break; + } + } + + if (model_name == null) { + Usage("-m must be used to specify model name"); + } + if (model_repository_path == null) { + Usage("-r must be used to specify model repository path"); + } + + // Check API version. + int[] api_version_major = {0}, api_version_minor = {0}; + FAIL_IF_ERR( + TRITONSERVER_ApiVersion(api_version_major, api_version_minor), + "getting Triton API version"); + if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) + || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) { + FAIL("triton server API version mismatch"); + } + + // Create the server... + TRITONSERVER_ServerOptions server_options = + new TRITONSERVER_ServerOptions(null); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsNew(server_options), + "creating server options"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetModelRepositoryPath( + server_options, model_repository_path), + "setting model repository path"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level), + "setting verbose logging level"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetBackendDirectory( + server_options, "/opt/tritonserver/backends"), + "setting backend directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetRepoAgentDirectory( + server_options, "/opt/tritonserver/repoagents"), + "setting repository agent directory"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true), + "setting strict model configuration"); + + TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null); + FAIL_IF_ERR( + TRITONSERVER_ServerNew(server_ptr, server_options), "creating server"); + FAIL_IF_ERR( + TRITONSERVER_ServerOptionsDelete(server_options), + "deleting server options"); + + TRITONSERVER_ServerDeleter server = + new TRITONSERVER_ServerDeleter(server_ptr); + + // Wait until the server is both live and ready. + int health_iters = 0; + while (true) { + boolean[] live = {false}, ready = {false}; + FAIL_IF_ERR( + TRITONSERVER_ServerIsLive(server, live), + "unable to get server liveness"); + FAIL_IF_ERR( + TRITONSERVER_ServerIsReady(server, ready), + "unable to get server readiness"); + System.out.println( + "Server Health: live " + live[0] + ", ready " + ready[0]); + if (live[0] && ready[0]) { + break; + } + + if (++health_iters >= 10) { + FAIL("failed to find healthy inference server"); + } + + Thread.sleep(500); + } + + // Print status of the server. 
+ { + TRITONSERVER_Message server_metadata_message = + new TRITONSERVER_Message(null); + FAIL_IF_ERR( + TRITONSERVER_ServerMetadata(server, server_metadata_message), + "unable to get server metadata message"); + BytePointer buffer = new BytePointer((Pointer) null); + SizeTPointer byte_size = new SizeTPointer(1); + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + server_metadata_message, buffer, byte_size), + "unable to serialize server metadata message"); + + System.out.println("Server Status:"); + System.out.println(buffer.limit(byte_size.get()).getString()); + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(server_metadata_message), + "deleting status metadata"); + } + + // Wait for the model to become available. + boolean[] is_torch_model = {false}; + boolean[] is_ready = {false}; + health_iters = 0; + while (!is_ready[0]) { + FAIL_IF_ERR( + TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready), + "unable to get model readiness"); + if (!is_ready[0]) { + if (++health_iters >= 10) { + FAIL("model failed to be ready in 10 iterations"); + } + Thread.sleep(500); + continue; + } + + TRITONSERVER_Message model_metadata_message = + new TRITONSERVER_Message(null); + FAIL_IF_ERR( + TRITONSERVER_ServerModelMetadata( + server, model_name, 1, model_metadata_message), + "unable to get model metadata message"); + BytePointer buffer = new BytePointer((Pointer) null); + SizeTPointer byte_size = new SizeTPointer(1); + FAIL_IF_ERR( + TRITONSERVER_MessageSerializeToJson( + model_metadata_message, buffer, byte_size), + "unable to serialize model status protobuf"); + + JsonParser parser = new JsonParser(); + JsonObject model_metadata = null; + try { + model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()) + .getAsJsonObject(); + } + catch (Exception e) { + FAIL("error: failed to parse model metadata from JSON: " + e); + } + + FAIL_IF_ERR( + TRITONSERVER_MessageDelete(model_metadata_message), + "deleting status protobuf"); + + if (!model_metadata.get("name").getAsString().equals(model_name)) { + FAIL("unable to find metadata for model"); + } + + boolean found_version = false; + if (model_metadata.has("versions")) { + for (JsonElement version : + model_metadata.get("versions").getAsJsonArray()) { + if (version.getAsString().equals("1")) { + found_version = true; + break; + } + } + } + if (!found_version) { + FAIL("unable to find version 1 status for model"); + } + + FAIL_IF_ERR( + ParseModelMetadata(model_metadata, is_torch_model), + "parsing model metadata"); + } + + // Create the allocator that will be used to allocate buffers for + // the result tensors. + TRITONSERVER_ResponseAllocator allocator = + new TRITONSERVER_ResponseAllocator(null); + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorNew( + allocator, responseAlloc, responseRelease, null /* start_fn */), + "creating response allocator"); + + // Inference + TRITONSERVER_InferenceRequest irequest = + new TRITONSERVER_InferenceRequest(null); + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestNew( + irequest, server, model_name, -1 /* model_version */), + "creating inference request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"), + "setting ID for the request"); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetReleaseCallback( + irequest, inferRequestComplete, null /* request_release_userp */), + "setting request release callback"); + + // Inputs + String input0 = is_torch_model[0] ? 
"INPUT__0" : "INPUT"; + + long[] input0_shape = {1}; + + int datatype = TRITONSERVER_TYPE_INT32; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddInput( + irequest, input0, datatype, input0_shape, input0_shape.length), + "setting input 0 meta-data for the request"); + + String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT"; + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0), + "requesting output 0 for the request"); + + // Non-zero ID for the sequence requests + long correlation_id = 5; + // Number of requests in the sequence + int num_requests = 9; + // Expected_result is 1+2+3+...+num_requests + int expected_result = num_requests * (1 + num_requests) / 2; + boolean sequence_start = true; + boolean sequence_end = false; + + // Create the initial data for the input tensor. + IntPointer[] p0 = {new IntPointer(1)}; + BytePointer input0_data = p0[0].getPointer(BytePointer.class); + long input0_size = input0_data.limit(); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestAppendInputData( + irequest, input0, input0_data, input0_size, requested_memory_type, + 0 /* memory_type_id */), + "assigning INPUT0 data"); + + for (int i = 0; i < num_requests; i++) { + // Update input value + int input = i + 1; + p0[0].put(0, input); + + // Set sequence metadata + if (i == 1) { + sequence_start = false; + } + if (i == num_requests - 1) { + sequence_end = true; + } + SetSequenceMetadata( + irequest, correlation_id, sequence_start, sequence_end); + + // Perform inference... + CompletableFuture completed = + new CompletableFuture<>(); + futures.put(irequest, completed); + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestSetResponseCallback( + irequest, allocator, null /* response_allocator_userp */, + inferResponseComplete, irequest), + "setting response callback"); + + FAIL_IF_ERR( + TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), + "running inference"); + + // Wait for the inference to complete. + TRITONSERVER_InferenceResponse completed_response = completed.get(); + futures.remove(irequest); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseError(completed_response), + "response status"); + + Check( + model_name, completed_response, input, output0, input0_size, datatype, + sequence_end, expected_result); + + FAIL_IF_ERR( + TRITONSERVER_InferenceResponseDelete(completed_response), + "deleting inference response"); + } + + FAIL_IF_ERR( + TRITONSERVER_InferenceRequestDelete(irequest), + "deleting inference request"); + + FAIL_IF_ERR( + TRITONSERVER_ResponseAllocatorDelete(allocator), + "deleting response allocator"); + + System.exit(0); + } +} diff --git a/qa/L0_java_sequence_batcher/test.sh b/qa/L0_java_sequence_batcher/test.sh new file mode 100755 index 0000000000..2f988322d9 --- /dev/null +++ b/qa/L0_java_sequence_batcher/test.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Models +DATADIR=/data/inferenceserver/${REPO_VERSION} +JAVACPP_BRANCH=${JAVACPP_BRANCH:="https://github.com/bytedeco/javacpp-presets.git"} +JAVACPP_BRANCH_TAG=${JAVACPP_BRANCH_TAG:="master"} + +# Set up test files based on installation instructions +# https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/README.md +set -e +git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git +source client/src/java-api-bindings/scripts/install_dependencies_and_build.sh -b $PWD --javacpp-branch ${JAVACPP_BRANCH} --javacpp-tag ${JAVACPP_BRANCH_TAG} --keep-build-dependencies +cd .. + +CLIENT_LOG="client.log" +MODEL_REPO=`pwd`/models +SAMPLES_REPO=`pwd`/javacpp-presets/tritonserver/samples/simple +BASE_COMMAND="mvn clean compile -f $SAMPLES_REPO exec:java -Djavacpp.platform=linux-x86_64" +source ../common/util.sh + +cp SequenceTest.java $SAMPLES_REPO +sed -i 's/Simple/SequenceTest/g' $SAMPLES_REPO/pom.xml + +rm -f *.log +RET=0 + +for BACKEND in graphdef libtorch onnx savedmodel; do + # Create local model repository + mkdir -p ${MODEL_REPO} + MODEL=${BACKEND}_nobatch_sequence_int32 + cp -r $DATADIR/qa_sequence_model_repository/${MODEL}/ ${MODEL_REPO}/ + sed -i "s/kind: KIND_GPU/kind: KIND_CPU/" ${MODEL_REPO}/$MODEL/config.pbtxt + + # Run with default settings + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO -m ${MODEL}" >>client.log 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + # Check results + if [ `grep -c "${MODEL} test PASSED" ${CLIENT_LOG}` != "1" ]; then + echo -e "\n***\n*** ${BACKEND} sequence batcher test FAILED. Expected '${MODEL} test PASSED'\n***" + RET=1 + fi + rm -r ${MODEL_REPO} + rm ${CLIENT_LOG} +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_java_simple_example/test.sh b/qa/L0_java_simple_example/test.sh new file mode 100755 index 0000000000..e9726edff4 --- /dev/null +++ b/qa/L0_java_simple_example/test.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Set up test files based on installation instructions +# https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/README.md +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi + +JAVACPP_BRANCH=${JAVACPP_BRANCH:="https://github.com/bytedeco/javacpp-presets.git"} +JAVACPP_BRANCH_TAG=${JAVACPP_BRANCH_TAG:="master"} +set -e +git clone --single-branch --depth=1 -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git +source client/src/java-api-bindings/scripts/install_dependencies_and_build.sh -b $PWD --javacpp-branch ${JAVACPP_BRANCH} --javacpp-tag ${JAVACPP_BRANCH_TAG} --keep-build-dependencies +cd .. + +CLIENT_LOG="client_cpu_only.log" +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +MODEL_REPO=`pwd`/models + +SAMPLES_REPO=`pwd`/javacpp-presets/tritonserver/samples/simple +BASE_COMMAND="mvn clean compile -f $SAMPLES_REPO exec:java -Djavacpp.platform=linux-x86_64" +source ../common/util.sh + + +rm -f *.log +RET=0 + +function run_cpu_tests_int32() { + # Create local model repository + set +e + rm -r ${MODEL_REPO} + cp -r `pwd`/../L0_simple_ensemble/models . + mkdir ${MODEL_REPO}/ensemble_add_sub_int32_int32_int32/1 + set -e + + # Run with default settings + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO" >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO}\"" + RET=1 + fi + + if [ `grep -c "1 - 1 = 0" ${CLIENT_LOG}` != "18" ]; then + echo -e "\n***\n*** Failed. Expected 18 '1 - 1 = 0'\n***" + RET=1 + fi + + # Run with verbose logging + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO -v" >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO} -v\"" + RET=1 + fi + + if [ `grep -c "Server side auto-completed config" ${CLIENT_LOG}` != "2" ]; then + echo -e "\n***\n*** Failed. 
Expected 'Server side auto-completed config'\n***" + RET=1 + fi + + # Run with memory set to system + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO -m system" >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO} -m system\"" + RET=1 + fi + + if [ `grep -c "OUTPUT0 is stored in system memory" ${CLIENT_LOG}` != "9" ]; then + echo -e "\n***\n*** Failed. Expected 9 'OUTPUT0 is stored in system memory'\n***" + RET=1 + fi + +} + +function run_cpu_tests_fp32() { + for trial in graphdef savedmodel; do + full=${trial}_float32_float32_float32 + set +e + rm -rf ${MODEL_REPO} + mkdir -p ${MODEL_REPO}/simple/1 && \ + cp -r $DATADIR/${full}/1/* ${MODEL_REPO}/simple/1/. && \ + cp $DATADIR/${full}/config.pbtxt ${MODEL_REPO}/simple/. && \ + (cd ${MODEL_REPO}/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt && \ + sed -i "s/label_filename:.*//" config.pbtxt) + + + # No memory type enforcement + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO -v" >>$CLIENT_LOG.$full.log 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG.$full.log + echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO} -v\" for ${full}" + RET=1 + fi + + # Enforce I/O to be in specific memory type + for MEM_TYPE in system; do + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO -m ${MEM_TYPE}" >>$CLIENT_LOG.$full.${MEM_TYPE}.log 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG.$full.$MEM_TYPE.log + echo -e "Failed to run: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO} -v -m ${MEM_TYPE}\" for ${full}" + RET=1 + fi + done + done + set -e +} + + +# Run ensemble +function run_ensemble_tests() { + set +e + rm -r ${MODEL_REPO} + cp -r `pwd`/../L0_simple_ensemble/models . + mkdir -p ${MODEL_REPO}/ensemble_add_sub_int32_int32_int32/1 + sed -i 's/"simple"/"ensemble_add_sub_int32_int32_int32"/g' $SAMPLES_REPO/Simple.java + cat $SAMPLES_REPO/pom.xml >>$CLIENT_LOG 2>&1 + set -e + + $BASE_COMMAND -Dexec.args="-r $MODEL_REPO -v" >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "Failed to run ensemble model: ${BASE_COMMAND} -Dexec.args=\"-r ${MODEL_REPO} -v\"" + RET=1 + fi + sed -i 's/"ensemble_add_sub_int32_int32_int32"/"simple"/g' $SAMPLES_REPO/Simple.java + + if [ `grep -c "request id: my_request_id, model: ensemble_add_sub_int32_int32_int32" ${CLIENT_LOG}` != "3" ]; then + echo -e "\n***\n*** Failed. Expected 3 'request id: my_request_id, model: ensemble_add_sub_int32_int32_int32'\n***" + RET=1 + fi +} + +# Run tests on simple example +echo -e "\nRunning Simple Tests\n" + +run_cpu_tests_fp32 +run_cpu_tests_int32 +run_ensemble_tests + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_json/test.sh b/qa/L0_json/test.sh new file mode 100755 index 0000000000..39aa07f040 --- /dev/null +++ b/qa/L0_json/test.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +RET=0 +UNIT_TEST="./triton_json_test --gtest_output=xml:triton_json.report.xml" +TEST_LOG="./triton_json_test.log" +$UNIT_TEST >> $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** Triton Json Unit Test Failed\n***" + RET=1 +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_large_payload/large_payload_test.py b/qa/L0_large_payload/large_payload_test.py new file mode 100755 index 0000000000..f9c0a49dfd --- /dev/null +++ b/qa/L0_large_payload/large_payload_test.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
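+# LargePayLoadTest sends identity-model requests at three payload sizes: a
+# small one-element tensor, a ~1.9 GB tensor that both HTTP and gRPC should
+# handle, and a ~3 GB tensor that is expected to fail over HTTP and is
+# currently skipped for gRPC because of the 2 GB protobuf limit.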
+ +import sys + +sys.path.append("../common") + +import math +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException, np_to_triton_dtype + + +class LargePayLoadTest(tu.TestResultCollector): + def setUp(self): + self._data_type = np.float32 + + # Very large tensor will always fail for gRPC because the Protobuf has a + # hard limit on 2GBs for the size of input tensors. All backends except + # plan backend should be able to handle payloads larger than 2GBs using + # HTTP. + very_large_tensor_shape = ( + math.trunc(3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize), + ) + self._very_large_in0 = np.random.random(very_large_tensor_shape).astype( + self._data_type + ) + + # 1.9 GBs allows us to test gRPC with moderate sizes too. + large_tensor_shape = ( + math.trunc( + 1.9 * (1024 * 1024 * 1024) // np.dtype(self._data_type).itemsize + ), + ) + self._large_in0 = np.random.random(large_tensor_shape).astype(self._data_type) + + small_tensor_shape = (1,) + self._small_in0 = np.random.random(small_tensor_shape).astype(self._data_type) + + self._clients = ( + (httpclient, httpclient.InferenceServerClient("localhost:8000")), + (grpcclient, grpcclient.InferenceServerClient("localhost:8001")), + ) + + def _test_helper( + self, client, model_name, input_name="INPUT0", output_name="OUTPUT0" + ): + # plan does not support large batch sizes. + if not model_name.startswith("plan"): + inputs = [ + client[0].InferInput( + input_name, + self._large_in0.shape, + np_to_triton_dtype(self._data_type), + ) + ] + inputs[0].set_data_from_numpy(self._large_in0) + results = client[1].infer(model_name, inputs) + + # if the inference is completed, examine results to ensure that + # the framework and protocol do support large payload + self.assertTrue( + np.array_equal(self._large_in0, results.as_numpy(output_name)), + "output is different from input", + ) + + if client[0] == httpclient: + # FIXME HTTPServer cannot support large payloads. See DLIS-1776. + inputs = [ + client[0].InferInput( + input_name, + self._very_large_in0.shape, + np_to_triton_dtype(self._data_type), + ) + ] + inputs[0].set_data_from_numpy(self._very_large_in0) + with self.assertRaises(InferenceServerException): + results = client[1].infer(model_name, inputs) + + # FIXME Test is terminated due to libprotobuf FATAL error when GRPC sends + # the second request with input tensors larger than 1.3GBs. In this test + # GRPC has been currently exempted from testing for Very Large tensor(3GBs) + # until the problem is resolved. Should be uncommented once the GRPC issue is resolved. + # See DLIS-2474. 
+ # if client[0] == grpcclient: + # inputs = [ + # client[0].InferInput(input_name, self._very_large_in0.shape, + # np_to_triton_dtype(self._data_type)) + # ] + # inputs[0].set_data_from_numpy(self._very_large_in0) + # # GRPC must fail for large payloads because of a 2GB protobuf limit + # with self.assertRaises(InferenceServerException): + # results = client[1].infer(model_name, inputs) + + # Send a small payload to verify if the server is still functional + inputs = [ + client[0].InferInput( + input_name, self._small_in0.shape, np_to_triton_dtype(self._data_type) + ) + ] + inputs[0].set_data_from_numpy(self._small_in0) + results = client[1].infer(model_name, inputs) + self.assertTrue( + np.array_equal(self._small_in0, results.as_numpy(output_name)), + "output is different from input", + ) + + def test_graphdef(self): + # graphdef_nobatch_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name("graphdef_nobatch", 1, self._data_type) + self._test_helper(client, model_name) + + def test_savedmodel(self): + # savedmodel_nobatch_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name( + "savedmodel_nobatch", 1, self._data_type + ) + self._test_helper(client, model_name) + + def test_onnx(self): + # onnx_nobatch_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name("onnx_nobatch", 1, self._data_type) + self._test_helper(client, model_name) + + def test_python(self): + # python_nobatch_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name("python_nobatch", 1, self._data_type) + self._test_helper(client, model_name) + + def test_plan(self): + # plan_nobatch_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name("plan_nobatch", 1, self._data_type) + self._test_helper(client, model_name) + + def test_libtorch(self): + # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name("libtorch_nobatch", 1, self._data_type) + self._test_helper(client, model_name, "INPUT__0", "OUTPUT__0") + + def test_custom(self): + # custom_zero_1_float32 is identity model with input shape [-1] + for client in self._clients: + model_name = tu.get_zero_model_name("custom", 1, self._data_type) + self._test_helper(client, model_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_large_payload/test.sh b/qa/L0_large_payload/test.sh new file mode 100755 index 0000000000..325cab4ed5 --- /dev/null +++ b/qa/L0_large_payload/test.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +LARGE_PAYLOAD_TEST_PY=large_payload_test.py +CLIENT_LOG_BASE="./client.log" +DATADIR=`pwd`/models + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR --log-verbose=1" +SERVER_LOG_BASE="./inference_server.log" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* + +RET=0 + +MODEL_SUFFIX=nobatch_zero_1_float32 +rm -fr all_models && mkdir all_models +for TARGET in graphdef savedmodel onnx libtorch plan; do + cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/${TARGET}_$MODEL_SUFFIX \ + all_models/. +done + +mkdir -p all_models/python_$MODEL_SUFFIX/1/ +cp ../python_models/identity_fp32/config.pbtxt all_models/python_$MODEL_SUFFIX/ +(cd all_models/python_$MODEL_SUFFIX && \ + sed -i "s/max_batch_size: 64/max_batch_size: 0/" config.pbtxt && \ + sed -i "s/name: \"identity_fp32\"/name: \"python_$MODEL_SUFFIX\"/" config.pbtxt) + +cp ../python_models/identity_fp32/model.py all_models/python_$MODEL_SUFFIX/1/model.py + +# Restart server before every test to make sure server state +# is invariant to previous test +for TARGET in graphdef savedmodel onnx libtorch plan python; do + rm -fr models && mkdir models && \ + cp -r all_models/${TARGET}_$MODEL_SUFFIX models/. + + SERVER_LOG=$SERVER_LOG_BASE.$TARGET + CLIENT_LOG=$CLIENT_LOG_BASE.$TARGET + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $LARGE_PAYLOAD_TEST_PY LargePayLoadTest.test_$TARGET >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_libtorch_disable_cudnn/test.sh b/qa/L0_libtorch_disable_cudnn/test.sh new file mode 100755 index 0000000000..9b53f5f8bb --- /dev/null +++ b/qa/L0_libtorch_disable_cudnn/test.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +LIBTORCH_INFER_CLIENT_PY=../common/libtorch_infer_client.py + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --log-verbose=1" +SERVER_LOG="./inference_server.log" +CLIENT_LOG="./client.log" +source ../common/util.sh + +RET=0 + +for FLAG in true false; do + rm -f *.log + mkdir -p models && cp -r $DATADIR/libtorch_int32_int32_int32 models/. + + echo """ + parameters: { + key: \"DISABLE_CUDNN\" + value: { + string_value: \"$FLAG\" + } + }""" >> models/libtorch_int32_int32_int32/config.pbtxt + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $LIBTORCH_INFER_CLIENT_PY >> $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + CUDNN_LOG="cuDNN is " + if [ "$FLAG" == "true" ]; then + CUDNN_LOG+=disabled + else + CUDNN_LOG+=enabled + fi + + if [ `grep -c "$CUDNN_LOG" $SERVER_LOG` != "3" ]; then + echo -e "\n***\n*** Failed. 
Expected 3 $CUDNN_LOG in log\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + + rm -rf models +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_libtorch_inference_mode/test.sh b/qa/L0_libtorch_inference_mode/test.sh new file mode 100755 index 0000000000..85b4a49fae --- /dev/null +++ b/qa/L0_libtorch_inference_mode/test.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +LIBTORCH_INFER_CLIENT_PY=../common/libtorch_infer_client.py + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --log-verbose=1" +SERVER_LOG="./inference_server.log" +CLIENT_LOG="./client.log" +source ../common/util.sh + +RET=0 + +for FLAG in true false; do + rm -f *.log + mkdir -p models && cp -r $DATADIR/libtorch_int32_int32_int32 models/. + + echo """ + parameters: { + key: \"INFERENCE_MODE\" + value: { + string_value: \"$FLAG\" + } + }""" >> models/libtorch_int32_int32_int32/config.pbtxt + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $LIBTORCH_INFER_CLIENT_PY >> $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + INFERMODE_LOG="Inference Mode is " + if [ "$FLAG" == "true" ]; then + INFERMODE_LOG+=enabled + else + INFERMODE_LOG+=disabled + fi + + if [ `grep -c "$INFERMODE_LOG" $SERVER_LOG` != "3" ]; then + echo -e "\n***\n*** Failed. 
Expected 3 $INFERMODE_LOG in log\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + + rm -rf models +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py new file mode 100755 index 0000000000..92bead3464 --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/client.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.http as httpclient + +# By default, find tritonserver on "localhost", but can be overridden +# with TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +class InferTest(tu.TestResultCollector): + def test_infer(self): + try: + triton_client = httpclient.InferenceServerClient( + url=f"{_tritonserver_ipaddr}:8000" + ) + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + + model_name = os.environ["MODEL_NAME"] + + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32")) + + # Create the data for the two input tensors. 
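+        # INPUT0 holds the values 0..15 and INPUT1 holds 32..47, so the
+        # expected sum and difference outputs checked below are easy to
+        # compute by hand.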
+ input0_data = np.arange(start=0, stop=16, dtype=np.float32) + input0_data = np.expand_dims(input0_data, axis=0) + input1_data = np.arange(start=32, stop=48, dtype=np.float32) + input1_data = np.expand_dims(input1_data, axis=0) + + # Initialize the data + inputs[0].set_data_from_numpy(input0_data, binary_data=True) + inputs[1].set_data_from_numpy(input1_data, binary_data=True) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT__0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT__1", binary_data=True)) + + results = triton_client.infer(model_name, inputs, outputs=outputs) + + output0_data = results.as_numpy("OUTPUT__0") + output1_data = results.as_numpy("OUTPUT__1") + + expected_output_0 = input0_data + input1_data + expected_output_1 = input0_data - input1_data + + self.assertEqual(output0_data.shape, (1, 16)) + self.assertEqual(output1_data.shape, (1, 16)) + + self.assertTrue(np.all(expected_output_0 == output0_data)) + self.assertTrue(np.all(expected_output_1 == output1_data)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py new file mode 100755 index 0000000000..e61980f491 --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py @@ -0,0 +1,90 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
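+# Generates two TorchScript models, libtorch_multi_gpu and
+# libtorch_multi_device, whose SumModule/DiffModule submodules move the
+# inputs onto different devices before computing INPUT0 + INPUT1 and
+# INPUT0 - INPUT1. The script exits early unless at least 4 GPUs are visible.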
+ +import torch +import torch.nn as nn + + +class SumModule(nn.Module): + def __init__(self, device): + super(SumModule, self).__init__() + self.device = device + + def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device) + INPUT1 = INPUT1.to(self.device) + print( + "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format( + INPUT0.device, INPUT1.device + ) + ) + return INPUT0 + INPUT1 + + +class DiffModule(nn.Module): + def __init__(self, device): + super(DiffModule, self).__init__() + self.device = device + + def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device) + INPUT1 = INPUT1.to(self.device) + print( + "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format( + INPUT0.device, INPUT1.device + ) + ) + return INPUT0 - INPUT1 + + +class TestModel(nn.Module): + def __init__(self, device0, device1): + super(TestModel, self).__init__() + self.device0 = device0 + self.device1 = device1 + + self.layer1 = SumModule(self.device0) + self.layer2 = DiffModule(self.device1) + + def forward(self, INPUT0, INPUT1): + op0 = self.layer1(INPUT0, INPUT1) + op1 = self.layer2(INPUT0, INPUT1) + return op0, op1 + + +if torch.cuda.device_count() < 4: + print("Need at least 4 GPUs to run this test") + exit(1) + +devices = [("cuda:2", "cuda:0"), ("cpu", "cuda:3")] +model_names = ["libtorch_multi_gpu", "libtorch_multi_device"] + +for device_pair, model_name in zip(devices, model_names): + model = TestModel(device_pair[0], device_pair[1]) + model_path = "models/" + model_name + "/1/model.pt" + scripted_model = torch.jit.script(model) + scripted_model.save(model_path) diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt new file mode 100644 index 0000000000..bf8ca0d649 --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt @@ -0,0 +1,60 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
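+# Config for the multi-device TorchScript model produced by gen_models.py.
+# instance_group KIND_MODEL leaves device placement to the model itself
+# instead of pinning the instance to a specific CPU or GPU.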
+ +name: "libtorch_multi_device" +platform: "pytorch_libtorch" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT__0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT__1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [ + { + kind: KIND_MODEL + } +] diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh new file mode 100755 index 0000000000..04d76bd036 --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/test.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +pip3 uninstall -y torch +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --log-verbose=1" +SERVER_LOG="./inference_server.log" + +CLIENT_PY=./client.py +CLIENT_LOG="./client.log" +EXPECTED_NUM_TESTS="1" +TEST_RESULT_FILE='test_results.txt' + +source ../common/util.sh + +RET=0 + +rm -f *.log *.txt + +mkdir -p models/libtorch_multi_device/1 +mkdir -p models/libtorch_multi_gpu/1 +cp models/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/. +(cd models/libtorch_multi_gpu && \ + sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt) + +# Generate the models which are partitioned across multiple devices +set +e +python3 gen_models.py >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Error when generating models. 
\n***" + cat $CLIENT_LOG + exit 1 +fi +set -e + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +export MODEL_NAME='libtorch_multi_device' +python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +MESSAGES=("SumModule - INPUT0 device: cpu, INPUT1 device: cpu" + "DiffModule - INPUT0 device: cuda:3, INPUT1 device: cuda:3") +for MESSAGE in "${MESSAGES[@]}"; do + if grep -q "$MESSAGE" "$SERVER_LOG"; then + echo -e "Found \"$MESSAGE\"" >> "$CLIENT_LOG" + else + echo -e "Not found \"$MESSAGE\"" >> "$CLIENT_LOG" + RET=1 + fi +done + +export MODEL_NAME='libtorch_multi_gpu' +python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +MESSAGES=("SumModule - INPUT0 device: cuda:2, INPUT1 device: cuda:2" + "DiffModule - INPUT0 device: cuda:0, INPUT1 device: cuda:0") +for MESSAGE in "${MESSAGES[@]}"; do + if grep -q "$MESSAGE" "$SERVER_LOG"; then + echo -e "Found \"$MESSAGE\"" >> "$CLIENT_LOG" + else + echo -e "Not found \"$MESSAGE\"" >> "$CLIENT_LOG" + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_libtorch_io_names/io_names_client.py b/qa/L0_libtorch_io_names/io_names_client.py new file mode 100755 index 0000000000..b74e520de2 --- /dev/null +++ b/qa/L0_libtorch_io_names/io_names_client.py @@ -0,0 +1,119 @@ +#!/usr/bin/python +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest +from builtins import range + +import numpy as np +import test_util as tu +import tritonclient.http as httpclient + + +class IONamingConvention(tu.TestResultCollector): + def _infer_helper(self, model_name, io_names, reversed_order=False): + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=False + ) + + # Create the data for the two inputs. Initialize the first to unique + # integers and the second to all ones. + input0_data = np.arange(start=0, stop=16, dtype=np.float32) + input0_data = np.expand_dims(input0_data, axis=0) + input1_data = np.full(shape=(1, 16), fill_value=-1, dtype=np.float32) + + inputs = [] + output_req = [] + inputs.append( + httpclient.InferInput( + io_names[0] if not reversed_order else io_names[1], [1, 16], "FP32" + ) + ) + inputs[-1].set_data_from_numpy(input0_data) + inputs.append( + httpclient.InferInput( + io_names[1] if not reversed_order else io_names[0], [1, 16], "FP32" + ) + ) + inputs[-1].set_data_from_numpy(input1_data) + output_req.append( + httpclient.InferRequestedOutput(io_names[2], binary_data=True) + ) + output_req.append( + httpclient.InferRequestedOutput(io_names[3], binary_data=True) + ) + + results = triton_client.infer(model_name, inputs, outputs=output_req) + + output0_data = results.as_numpy( + io_names[2] if not reversed_order else io_names[3] + ) + output1_data = results.as_numpy( + io_names[3] if not reversed_order else io_names[2] + ) + for i in range(16): + self.assertEqual(input0_data[0][i] - input1_data[0][i], output0_data[0][i]) + self.assertEqual(input0_data[0][i] + input1_data[0][i], output1_data[0][i]) + + def test_io_index(self): + io_names = ["INPUT__0", "INPUT__1", "OUTPUT__0", "OUTPUT__1"] + self._infer_helper("libtorch_io_index", io_names) + + def test_output_index(self): + io_names = ["INPUT0", "INPUT1", "OUTPUT__0", "OUTPUT__1"] + self._infer_helper("libtorch_output_index", io_names) + + def test_no_output_index(self): + io_names = ["INPUT0", "INPUT1", "OUTPUT0", "OUTPUT1"] + self._infer_helper("libtorch_no_output_index", io_names) + + def test_no_arguments_no_output_index(self): + io_names = ["INPUTA", "INPUTB", "OUTPUTA", "OUTPUTB"] + self._infer_helper("libtorch_no_arguments_output_index", io_names) + + def test_mix_index(self): + io_names = ["INPUTA", "INPUT__1", "OUTPUTA", "OUTPUT__1"] + self._infer_helper("libtorch_mix_index", io_names) + + def test_mix_arguments(self): + io_names = ["INPUT0", "INPUTB", "OUTPUTA", "OUTPUT__1"] + self._infer_helper("libtorch_mix_arguments", io_names) + + def test_mix_arguments_index(self): + io_names = ["INPUT0", "INPUT__1", "OUTPUT0", "OUTPUT__1"] + self._infer_helper("libtorch_mix_arguments_index", io_names) + + def test_unordered_index(self): + io_names = ["INPUT1", "INPUT0", "OUT__1", "OUT__0"] + self._infer_helper("libtorch_unordered_index", io_names, reversed_order=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_libtorch_io_names/test.sh 
b/qa/L0_libtorch_io_names/test.sh new file mode 100755 index 0000000000..999bb2b513 --- /dev/null +++ b/qa/L0_libtorch_io_names/test.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +IO_NAMES_CLIENT=./io_names_client.py +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository + +rm -rf models && mkdir -p models + +# Prepare models +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_output_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_output_index/' models/libtorch_output_index/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_io_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_io_index/' models/libtorch_io_index/config.pbtxt && \ + sed -i 's/INPUT0/INPUT__0/' models/libtorch_io_index/config.pbtxt && \ + sed -i 's/INPUT1/INPUT__1/' models/libtorch_io_index/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_no_output_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_no_output_index/' models/libtorch_no_output_index/config.pbtxt && \ + sed -i 's/OUTPUT__0/OUTPUT0/' models/libtorch_no_output_index/config.pbtxt && \ + sed -i 's/OUTPUT__1/OUTPUT1/' models/libtorch_no_output_index/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_no_arguments_output_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_no_arguments_output_index/' models/libtorch_no_arguments_output_index/config.pbtxt && \ + sed -i 's/INPUT0/INPUTA/' models/libtorch_no_arguments_output_index/config.pbtxt && \ + sed -i 's/INPUT1/INPUTB/' models/libtorch_no_arguments_output_index/config.pbtxt && \ + sed -i 's/OUTPUT__0/OUTPUTA/' models/libtorch_no_arguments_output_index/config.pbtxt && \ + sed -i 's/OUTPUT__1/OUTPUTB/' models/libtorch_no_arguments_output_index/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_mix_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_mix_index/' models/libtorch_mix_index/config.pbtxt && \ + sed -i 's/INPUT0/INPUTA/' models/libtorch_mix_index/config.pbtxt && \ + sed -i 's/INPUT1/INPUT__1/' models/libtorch_mix_index/config.pbtxt && \ + sed -i 's/OUTPUT__0/OUTPUTA/' models/libtorch_mix_index/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_mix_arguments && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_mix_arguments/' models/libtorch_mix_arguments/config.pbtxt && \ + sed -i 's/INPUT1/INPUTB/' models/libtorch_mix_arguments/config.pbtxt && \ + sed -i 's/OUTPUT__0/OUTPUTA/' models/libtorch_mix_arguments/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_mix_arguments_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_mix_arguments_index/' models/libtorch_mix_arguments_index/config.pbtxt && \ + sed -i 's/INPUT1/INPUT__1/' models/libtorch_mix_arguments_index/config.pbtxt && \ + sed -i 's/OUTPUT__0/OUTPUT0/' models/libtorch_mix_arguments_index/config.pbtxt + +cp -r $DATADIR/libtorch_float32_float32_float32 models/libtorch_unordered_index && \ + sed -i 's/libtorch_float32_float32_float32/libtorch_unordered_index/' models/libtorch_unordered_index/config.pbtxt && \ + sed -i 's/INPUT0/INPUT_TMP1/' models/libtorch_unordered_index/config.pbtxt && \ + sed -i 's/INPUT1/INPUT0/' models/libtorch_unordered_index/config.pbtxt && \ + sed -i 's/INPUT_TMP1/INPUT1/' models/libtorch_unordered_index/config.pbtxt && \ + sed -i 's/OUTPUT__0/OUT__1/' models/libtorch_unordered_index/config.pbtxt && \ + sed -i 's/OUTPUT__1/OUT__0/' models/libtorch_unordered_index/config.pbtxt + + 
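+# Each copy above renames the I/O tensors in its config.pbtxt so that one
+# model exists for every naming convention exercised by io_names_client.py:
+# double-underscore index names (INPUT__0, OUTPUT__1), plain argument names
+# (INPUTA, OUTPUTB), mixed styles, and an unordered case where the
+# INPUT0/INPUT1 names are swapped in the config.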
+SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f *.log + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +set +e + +CLIENT_LOG=client.log +python $IO_NAMES_CLIENT >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_libtorch_io_types/test.sh b/qa/L0_libtorch_io_types/test.sh new file mode 100755 index 0000000000..ddd38810b6 --- /dev/null +++ b/qa/L0_libtorch_io_types/test.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models" +SERVER_LOG="./server.log" +DATADIR=/data/inferenceserver/${REPO_VERSION} +source ../common/util.sh + +# Test unsupported INPUT data type +rm -rf models && mkdir -p models +cp -r $DATADIR/qa_model_repository/libtorch_int32_int8_int8 models/libtorch_invalid_input_type && \ + sed -i 's/libtorch_int32_int8_int8/libtorch_invalid_input_type/' models/libtorch_invalid_input_type/config.pbtxt && \ + sed -i 's/TYPE_INT32/TYPE_UINT32/' models/libtorch_invalid_input_type/config.pbtxt + +rm -f *.log + +run_server +if [ "$SERVER_PID" != "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unexpected server start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + exit 1 +fi + +set +e +grep "unsupported datatype TYPE_UINT32 for input 'INPUT0' for model 'libtorch_invalid_input_type'" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unsupported INPUT datatype not found in server log\n***" + exit 1 +fi +set -e + +# Test unsupported OUTPUT data type +rm -rf models && mkdir -p models +cp -r $DATADIR/qa_model_repository/libtorch_int32_int8_int8 models/libtorch_invalid_output_type && \ + sed -i 's/libtorch_int32_int8_int8/libtorch_invalid_output_type/' models/libtorch_invalid_output_type/config.pbtxt && \ + sed -i 's/TYPE_INT8/TYPE_UINT64/' models/libtorch_invalid_output_type/config.pbtxt + +rm -f *.log + +run_server +if [ "$SERVER_PID" != "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unexpected server start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + exit 1 +fi + +set +e +grep "unsupported datatype TYPE_UINT64 for output 'OUTPUT__0' for model 'libtorch_invalid_output_type'" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unsupported OUTPUT datatype not found in server log\n***" + exit 1 +fi +set -e + +# Test unsupported sequence_batching data type +rm -rf models && mkdir -p models +cp -r $DATADIR/qa_variable_sequence_model_repository/libtorch_sequence_int32 models/libtorch_invalid_sequence_int32 && \ + sed -i 's/libtorch_sequence_int32/libtorch_invalid_sequence_int32/' models/libtorch_invalid_sequence_int32/config.pbtxt && \ + sed -i 's/READY__2/CORRID__2/' models/libtorch_invalid_sequence_int32/config.pbtxt && \ + sed -i 's/CONTROL_SEQUENCE_READY/CONTROL_SEQUENCE_CORRID/' models/libtorch_invalid_sequence_int32/config.pbtxt && \ + sed -i ':begin;$!N;s/CORRID\n\(.*\)int32_false_true: \[ 0, 1 \]/CORRID\ndata_type: TYPE_UINT32/' models/libtorch_invalid_sequence_int32/config.pbtxt + +rm -f *.log + +run_server +if [ "$SERVER_PID" != "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unexpected server start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + exit 1 +fi + +set +e +grep "input 'CORRID__2' type 'TYPE_UINT32' is not supported by PyTorch." $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unsupported sequence_batching datatype not found in server log\n***" + exit 1 +fi +set -e + +# Test passed +echo -e "\n***\n*** Test Passed\n***" +exit 0 diff --git a/qa/L0_libtorch_optimized_execution/test.sh b/qa/L0_libtorch_optimized_execution/test.sh new file mode 100755 index 0000000000..5b7e19e282 --- /dev/null +++ b/qa/L0_libtorch_optimized_execution/test.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +LIBTORCH_INFER_CLIENT_PY=../common/libtorch_infer_client.py + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --log-verbose=1" +SERVER_LOG="./inference_server.log" +CLIENT_LOG="./client.log" +source ../common/util.sh + +RET=0 + +for FLAG in true false; do + rm -f *.log + mkdir -p models && cp -r $DATADIR/libtorch_int32_int32_int32 models/. + + echo """ + parameters: { + key: \"DISABLE_OPTIMIZED_EXECUTION\" + value: { + string_value: \"$FLAG\" + } + }""" >> models/libtorch_int32_int32_int32/config.pbtxt + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $LIBTORCH_INFER_CLIENT_PY >> $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + OPTIMIZED_LOG="Optimized execution is " + if [ "$FLAG" == "true" ]; then + OPTIMIZED_LOG+=disabled + else + OPTIMIZED_LOG+=enabled + fi + + if [ `grep -c "$OPTIMIZED_LOG" $SERVER_LOG` != "3" ]; then + echo -e "\n***\n*** Failed. 
Expected 3 $OPTIMIZED_LOG in log\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + + rm -rf models +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py b/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py new file mode 100755 index 0000000000..7c2fdb5a71 --- /dev/null +++ b/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest +from builtins import range + +import numpy as np +import test_util as tu +import tritonhttpclient as httpclient + +FLAGS = None + + +class SharedWeightsTest(tu.TestResultCollector): + def _full_exact(self, model_name, request_concurrency, shape): + # Run async requests to make sure backend handles concurrent requests + # correctly. + client = httpclient.InferenceServerClient( + "localhost:8000", concurrency=request_concurrency + ) + input_datas = [] + requests = [] + for i in range(request_concurrency): + input_data = (16384 * np.random.randn(*shape)).astype(np.float32) + input_datas.append(input_data) + inputs = [httpclient.InferInput("INPUT__0", input_data.shape, "FP32")] + inputs[0].set_data_from_numpy(input_data) + requests.append(client.async_infer(model_name, inputs)) + + for i in range(request_concurrency): + # Get the result from the initiated asynchronous inference request. + # Note the call will block until the server responds. 
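+            # Because the model is an identity model, each response should
+            # match the input that was sent with the corresponding request.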
+ results = requests[i].get_result() + + output_data = results.as_numpy("OUTPUT__0") + self.assertIsNotNone(output_data, "error: expected 'OUTPUT__0' to be found") + np.testing.assert_allclose(output_data, input_datas[i]) + + def test_pytorch_identity_model(self): + model_name = "libtorch_nobatch_zero_1_float32" + self._full_exact(model_name, 128, [8]) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_libtorch_shared_weights/test.sh b/qa/L0_libtorch_shared_weights/test.sh new file mode 100755 index 0000000000..6ca251ce32 --- /dev/null +++ b/qa/L0_libtorch_shared_weights/test.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +DATADIR=/data/inferenceserver/${REPO_VERSION} +INSTANCE_CNT=16 +REUSE_MSG="Reusing TorchScript model for instance" +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --exit-on-error=false \ + --exit-timeout-secs=10" +TEST_RESULT_FILE='test_results.txt' +WEIGHTS_TEST=libtorch_shared_weights_test.py +source ../common/util.sh + +RET=0 +rm -fr *.log + +LOG_IDX=0 + +# SharedWeightsTest.test_pytorch_identity_model +# Without shared weights, GPU + +# Prepare model repository +rm -fr models +mkdir models +for i in models; do + cp -r $DATADIR/qa_identity_model_repository/libtorch_nobatch_zero_1_float32 models/. 
+done + +for MC in `ls models/libtorch*/config.pbtxt`; do + echo "instance_group [ { count: ${INSTANCE_CNT} kind: KIND_GPU}]" >> $MC +done + +# Start server +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Run test +rm -f $CLIENT_LOG +set +e +python $WEIGHTS_TEST SharedWeightsTest.test_pytorch_identity_model >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +if [ `grep -c "$REUSE_MSG" $SERVER_LOG` != "0" ]; then + echo -e "\n***\n*** Failed. Expected 0 "$REUSE_MSG"\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# SharedWeightsTest.test_pytorch_identity_model +# With shared weights + +for KIND in KIND_CPU KIND_GPU; do + + # Prepare model repository + rm -fr models + mkdir models + for i in models; do + cp -r $DATADIR/qa_identity_model_repository/libtorch_nobatch_zero_1_float32 models/. + done + + LOG_IDX=$((LOG_IDX+1)) + for MC in `ls models/libtorch*/config.pbtxt`; do + echo "instance_group [ { count: ${INSTANCE_CNT} kind: ${KIND}}]" >> $MC + done + + for MC in `ls models/libtorch*/config.pbtxt`; do + echo """ + parameters: { + key: \"ENABLE_WEIGHT_SHARING\" + value: { + string_value: \"true\" + } + }""" >> $MC + done + + # Start server + SERVER_LOG="./inference_server_$LOG_IDX.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # Run test + rm -f $CLIENT_LOG + set +e + python $WEIGHTS_TEST SharedWeightsTest.test_pytorch_identity_model >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + if [ `grep -c "$REUSE_MSG" $SERVER_LOG` != "15" ]; then + echo -e "\n***\n*** Failed. Expected 15 "$REUSE_MSG"\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test Cleanup +rm -f $CLIENT_LOG + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_lifecycle/ensemble_zero_1_float32/config.pbtxt b/qa/L0_lifecycle/ensemble_zero_1_float32/config.pbtxt new file mode 100644 index 0000000000..9c6f37c9a9 --- /dev/null +++ b/qa/L0_lifecycle/ensemble_zero_1_float32/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble_zero_1_float32" +platform: "ensemble" +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "custom_zero_1_float32" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + } + ] +} diff --git a/qa/L0_lifecycle/identity_zero_1_int32/config.pbtxt b/qa/L0_lifecycle/identity_zero_1_int32/config.pbtxt new file mode 100644 index 0000000000..0f971aa5fd --- /dev/null +++ b/qa/L0_lifecycle/identity_zero_1_int32/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
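+# Identity model served by the 'identity' backend. The creation_delay_sec
+# parameter below delays instance creation by 10 seconds, giving the
+# L0_lifecycle tests a window in which the model is still loading.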
+ +name: "identity_zero_1_int32" +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters [ + { + key: "creation_delay_sec" + value: { string_value: "10" } + } +] diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py old mode 100644 new mode 100755 index 68f576aaf7..49fe684ff1 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,558 +27,1024 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + sys.path.append("../common") -from builtins import range -from future.utils import iteritems +import base64 +import concurrent.futures +import json +import multiprocessing import os import shutil +import signal +import threading import time import unittest -import numpy as np +from builtins import range +from functools import partial +from pathlib import Path + import infer_util as iu +import numpy as np import test_util as tu -from tensorrtserver.api import * -import tensorrtserver.api.server_status_pb2 as server_status +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException -class LifeCycleTest(unittest.TestCase): - def test_parse_error_noexit(self): - # Server was started with invalid args and - # --exit-on-error=false so expect it to be running with - # SERVER_FAILED_TO_INITIALIZE status. 
- # --strict-readiness=false so server is not live and not ready - try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], None, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_FAILED_TO_INITIALIZE, ss.ready_state) - self.assertEqual(len(ss.model_status), 0) - uptime = ss.uptime_ns - self.assertGreater(uptime, 0) - - hctx = ServerHealthContext(pair[0], pair[1], True) - self.assertFalse(hctx.is_ready()) - self.assertFalse(hctx.is_live()) +class LifeCycleTest(tu.TestResultCollector): + def _infer_success_models( + self, model_base_names, versions, tensor_shape, swap=False + ): + for base_name in model_base_names: + try: + model_name = tu.get_model_name( + base_name, np.float32, np.float32, np.float32 + ) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + # FIXME is_server_ready should be true here DLIS-1296 + # self.assertTrue(triton_client.is_server_ready()) + for v in versions: + self.assertTrue( + triton_client.is_model_ready(model_name, str(v)) + ) + + for v in versions: + iu.infer_exact( + self, + base_name, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=v, + swap=(swap or (v != 1)), + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) - except InferenceServerException as ex: + def _infer_success_identity(self, model_base, versions, tensor_dtype, tensor_shape): + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + for v in versions: + self.assertTrue( + triton_client.is_model_ready( + tu.get_zero_model_name(model_base, 1, tensor_dtype), str(v) + ) + ) + + for v in versions: + iu.infer_zero( + self, + model_base, + 1, + tensor_dtype, + tensor_shape, + tensor_shape, + use_http=False, + use_grpc=True, + use_http_json_tensors=False, + use_streaming=False, + ) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - def test_parse_error_noexit_strict(self): + def _get_client(self, use_grpc=False): + if use_grpc: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + else: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + return triton_client + + def _async_load(self, model_name, use_grpc): + try: + triton_client = self._get_client(use_grpc) + triton_client.load_model(model_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_parse_error_noexit(self): # Server was started with invalid args and # --exit-on-error=false so expect it to be running with # SERVER_FAILED_TO_INITIALIZE status. 
- # --strict-readiness=false so server is not live and not ready - try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], None, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_FAILED_TO_INITIALIZE, ss.ready_state) - self.assertEqual(len(ss.model_status), 0) - uptime = ss.uptime_ns - self.assertGreater(uptime, 0) - - hctx = ServerHealthContext(pair[0], pair[1], True) - self.assertFalse(hctx.is_ready()) - self.assertFalse(hctx.is_live()) + # Server is not live and not ready regardless of --strict-readiness + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + self.assertFalse(triton_client.is_server_live()) + self.assertFalse(triton_client.is_server_ready()) + md = triton_client.get_server_metadata() + self.assertEqual(os.environ["TRITON_SERVER_VERSION"], md.version) + self.assertEqual("triton", md.name) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + self.assertFalse(triton_client.is_server_live()) + self.assertFalse(triton_client.is_server_ready()) + md = triton_client.get_server_metadata() + self.assertEqual(os.environ["TRITON_SERVER_VERSION"], md["version"]) + self.assertEqual("triton", md["name"]) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_parse_error_modelfail(self): # --strict-readiness=true so server is live but not ready - input_size = 16 - tensor_shape = (input_size,) + tensor_shape = (1, 16) # Server was started but with a model that fails to load try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - model_name = tu.get_model_name('graphdef', np.float32, np.float32, np.float32) - ctx = ServerStatusContext(pair[0], pair[1], model_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - uptime = ss.uptime_ns - self.assertGreater(uptime, 0) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(model_name in ss.model_status, - "expected status for model " + model_name) - for (k, v) in iteritems(ss.model_status[model_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_UNAVAILABLE) - - hctx = ServerHealthContext(pair[0], pair[1], True) - self.assertFalse(hctx.is_ready()) - self.assertTrue(hctx.is_live()) + model_name = tu.get_model_name( + "graphdef", np.float32, np.float32, np.float32 + ) + + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertFalse(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertFalse(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) - except InferenceServerException as ex: + # Inferencing with the missing model should 
fail. + try: + iu.infer_exact( + self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32 + ) + self.assertTrue(False, "expected error for unavailable model " + model_name) + except Exception as ex: + self.assertIn( + "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions", + ex.message(), + ) + + # And other models should be loaded successfully + try: + for base_name in ["savedmodel", "onnx"]: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + model_name = tu.get_model_name( + base_name, np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + + iu.infer_exact( + self, + base_name, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_parse_error_modelfail_nostrict(self): + # --strict-readiness=false so server is live and ready + tensor_shape = (1, 16) + + # Server was started but with a model that fails to load + try: + model_name = tu.get_model_name( + "graphdef", np.float32, np.float32, np.float32 + ) + + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) + # Inferencing with the missing model should fail. 
try: - iu.infer_exact(self, 'graphdef', tensor_shape, 1, True, - np.float32, np.float32, np.float32) + iu.infer_exact( + self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32 + ) self.assertTrue(False, "expected error for unavailable model " + model_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith( - "Inference request for unknown model 'graphdef_float32_float32_float32'")) + except Exception as ex: + self.assertIn( + "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions", + ex.message(), + ) + + # And other models should be loaded successfully + try: + for base_name in ["savedmodel", "onnx"]: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + model_name = tu.get_model_name( + base_name, np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + + iu.infer_exact( + self, + base_name, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_parse_error_no_model_config(self): + tensor_shape = (1, 16) + + # Server was started but with a model that fails to be polled + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + model_name = tu.get_model_name( + "graphdef", np.float32, np.float32, np.float32 + ) + + # expecting ready because not strict readiness + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + + md = triton_client.get_model_metadata(model_name, "1") + self.assertTrue( + False, + "expected model '" + + model_name + + "' to be ignored due to polling failure", + ) + + except Exception as ex: + self.assertIn( + "Request for unknown model: 'graphdef_float32_float32_float32' is not found", + ex.message(), + ) + + # And other models should be loaded successfully + try: + for base_name in ["savedmodel", "onnx"]: + model_name = tu.get_model_name( + base_name, np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + + iu.infer_exact( + self, + base_name, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_init_error_modelfail(self): + # --strict-readiness=true so server is live but not ready + + # Server was started but with models that fail to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertFalse(triton_client.is_server_ready()) + + # one model uses sequence batcher while the other uses dynamic batcher + model_names = ["onnx_sequence_int32", "onnx_int32_int32_int32"] + for model_name in model_names: + self.assertFalse(triton_client.is_model_ready(model_name)) + + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # And other models should be loaded successfully + try: + for base_name in ["graphdef", "savedmodel", "onnx"]: + model_name = 
tu.get_model_name( + base_name, np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + try: + tensor_shape = (1, 16) + for base_name in ["graphdef", "savedmodel", "onnx"]: + iu.infer_exact( + self, + base_name, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_parse_error_model_no_version(self): + # --strict-readiness=true so server is live but not ready + tensor_shape = (1, 16) + + # Server was started but with a model that fails to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertFalse(triton_client.is_server_ready()) + + model_name = tu.get_model_name( + "graphdef", np.float32, np.float32, np.float32 + ) + self.assertFalse(triton_client.is_model_ready(model_name)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Sanity check that other models are loaded properly + try: + for base_name in ["savedmodel", "onnx"]: + model_name = tu.get_model_name( + base_name, np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name)) + for version in ["1", "3"]: + model_name = tu.get_model_name( + "plan", np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name, version)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + try: + for base_name in ["savedmodel", "onnx"]: + iu.infer_exact( + self, + base_name, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + ) + for version in [1, 3]: + iu.infer_exact( + self, + "plan", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=(version == 3), + model_version=version, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + try: + iu.infer_exact( + self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32 + ) + self.assertTrue(False, "expected error for unavailable model " + model_name) + except Exception as ex: + self.assertIn( + "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions", + ex.message(), + ) + + def test_parse_ignore_zero_prefixed_version(self): + tensor_shape = (1, 16) + + # Server was started but only version 1 is loaded + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + + model_name = tu.get_model_name( + "savedmodel", np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + try: + # swap=False for version 1 + iu.infer_exact( + self, + "savedmodel", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=False, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_parse_ignore_non_intergral_version(self): + tensor_shape = (1, 16) + + # Server was started but only version 1 is loaded + for 
triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + + model_name = tu.get_model_name( + "savedmodel", np.float32, np.float32, np.float32 + ) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + try: + # swap=False for version 1 + iu.infer_exact( + self, + "savedmodel", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=False, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) def test_dynamic_model_load_unload(self): - input_size = 16 - tensor_shape = (input_size,) - savedmodel_name = tu.get_model_name('savedmodel', np.float32, np.float32, np.float32) - netdef_name = tu.get_model_name('netdef', np.float32, np.float32, np.float32) + tensor_shape = (1, 16) + savedmodel_name = tu.get_model_name( + "savedmodel", np.float32, np.float32, np.float32 + ) + onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32) # Make sure savedmodel model is not in the status (because - # initially it is not in the model store) - try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertTrue(False, "expected status failure for " + savedmodel_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("no status available for unknown model")) + # initially it is not in the model repository) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) - # Add savedmodel model to the model store and give it time to + # Add savedmodel model to the model repository and give it time to # load. Make sure that it has a status and is ready. 
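In polling mode the test that follows copies a model directory into the served repository and then sleeps a fixed 5 seconds before asserting readiness. A hedged sketch of the same flow is shown here, polling per-version readiness rather than relying on the fixed sleep. The helper name, the `models/` repository path, and the assumption that the server was started with repository polling enabled are illustrative, mirroring the test setup rather than quoting it.

```python
# Illustrative sketch (not part of the patch): dynamic load in polling mode.
# Copy the model directory into the served repository, then poll readiness
# for each expected version instead of sleeping a fixed 5 seconds.
import shutil
import time

import tritonclient.http as httpclient


def load_by_copy(model_name, versions=("1", "3"), timeout_s=30.0):
    # The repository layout mirrors the tests: the source directory is copied
    # into "models/", which the server polls for changes.
    shutil.copytree(model_name, "models/" + model_name)
    client = httpclient.InferenceServerClient("localhost:8000")
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if all(client.is_model_ready(model_name, v) for v in versions):
            return
        time.sleep(0.5)
    raise TimeoutError("model {} did not become ready".format(model_name))
```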
try: shutil.copytree(savedmodel_name, "models/" + savedmodel_name) - time.sleep(5) # wait for model to load - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(savedmodel_name in ss.model_status, - "expected status for model " + savedmodel_name) - for (k, v) in iteritems(ss.model_status[savedmodel_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - except InferenceServerException as ex: + time.sleep(5) # wait for model to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference on the just loaded model try: - iu.infer_exact(self, 'savedmodel', tensor_shape, 1, True, - np.float32, np.float32, np.float32, swap=True) - except InferenceServerException as ex: + iu.infer_exact( + self, + "savedmodel", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + ) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Make sure savedmodel has execution stats in the status. 
- expected_exec_cnt = 0 + # Make sure savedmodel has execution stats try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(savedmodel_name in ss.model_status, - "expected status for model " + savedmodel_name) - self.assertTrue(3 in ss.model_status[savedmodel_name].version_status, - "expected status for version 3 of model " + savedmodel_name) - - version_status = ss.model_status[savedmodel_name].version_status[3] - self.assertEqual(version_status.ready_state, server_status.MODEL_READY) - self.assertGreater(version_status.model_execution_count, 0) - expected_exec_cnt = version_status.model_execution_count - except InferenceServerException as ex: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + stats = triton_client.get_inference_statistics(savedmodel_name) + self.assertEqual(len(stats["model_stats"]), 2) + for idx in range(len(stats["model_stats"])): + self.assertEqual(stats["model_stats"][idx]["name"], savedmodel_name) + if stats["model_stats"][idx]["version"] == "1": + self.assertEqual( + stats["model_stats"][idx]["inference_stats"]["success"][ + "count" + ], + 0, + ) + else: + self.assertNotEqual( + stats["model_stats"][idx]["inference_stats"]["success"][ + "count" + ], + 0, + ) + + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + stats = triton_client.get_inference_statistics(savedmodel_name) + self.assertEqual(len(stats.model_stats), 2) + for idx in range(len(stats.model_stats)): + self.assertEqual(stats.model_stats[idx].name, savedmodel_name) + if stats.model_stats[idx].version == "1": + self.assertEqual( + stats.model_stats[idx].inference_stats.success.count, 0 + ) + else: + self.assertNotEqual( + stats.model_stats[idx].inference_stats.success.count, 0 + ) + + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Remove savedmodel model from the model store and give it - # time to unload. Make sure that it has a status but is - # unavailable. + # Remove savedmodel model from the model repository and give it + # time to unload. Make sure that it is no longer available. 
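After the model directory is removed below, the test asserts that inference fails with a "has no available versions" error, using the `iu.infer_exact` helper. For reference, a minimal sketch of how that failure surfaces through a bare client call is given here; the tensor name, shape, and datatype are assumptions for illustration (they are not taken from `infer_util`), while the model name and error text come from the assertions in the test.

```python
# Illustrative sketch (not part of the patch): the "unavailable model" failure
# seen through a plain tritonclient call. INPUT0/[1, 16]/FP32 are assumed
# placeholder I/O details; the tests use the iu.infer_exact helper instead.
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException

client = httpclient.InferenceServerClient("localhost:8000")
inputs = [httpclient.InferInput("INPUT0", [1, 16], "FP32")]
inputs[0].set_data_from_numpy(np.zeros((1, 16), dtype=np.float32))

try:
    client.infer("savedmodel_float32_float32_float32", inputs)
except InferenceServerException as ex:
    # Expected once the model directory has been removed and unloaded:
    # "Request for unknown model: '...' has no available versions"
    print(ex.message())
```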
try: shutil.rmtree("models/" + savedmodel_name) - time.sleep(5) # wait for model to unload - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(savedmodel_name in ss.model_status, - "expected status for model " + savedmodel_name) - self.assertTrue(3 in ss.model_status[savedmodel_name].version_status, - "expected status for version 3 of model " + savedmodel_name) - - version_status = ss.model_status[savedmodel_name].version_status[3] - self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) - self.assertEqual(version_status.model_execution_count, expected_exec_cnt) - except InferenceServerException as ex: + time.sleep(5) # wait for model to unload + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Model is removed so inference should fail try: - iu.infer_exact(self, 'savedmodel', tensor_shape, 1, True, - np.float32, np.float32, np.float32, swap=True) - self.assertTrue(False, "expected error for unavailable model " + savedmodel_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) + iu.infer_exact( + self, + "savedmodel", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + ) self.assertTrue( - ex.message().startswith( - "Inference request for unknown model 'savedmodel_float32_float32_float32'")) + False, "expected error for unavailable model " + savedmodel_name + ) + except Exception as ex: + self.assertIn( + "Request for unknown model: '{}' has no available versions".format( + savedmodel_name + ), + ex.message(), + ) # Add back the same model. The status/stats should be reset. 
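The code that follows re-adds the model and then checks, via `get_inference_statistics`, that the success counts for both versions have been reset to zero. A small sketch of a helper that extracts those per-version counts from the HTTP payload is included here; the nested keys (`model_stats`, `inference_stats`, `success`, `count`) mirror exactly what the test indexes, while the helper name itself is illustrative.

```python
# Illustrative helper (not part of the patch): summarize per-version success
# counts from the HTTP client's inference statistics payload.
import tritonclient.http as httpclient


def success_counts(model_name):
    client = httpclient.InferenceServerClient("localhost:8000")
    stats = client.get_inference_statistics(model_name)
    return {
        entry["version"]: entry["inference_stats"]["success"]["count"]
        for entry in stats["model_stats"]
    }


# After a model is removed and added back, every version should report zero
# successes, e.g. {"1": 0, "3": 0} for savedmodel_float32_float32_float32.
```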
try: shutil.copytree(savedmodel_name, "models/" + savedmodel_name) - time.sleep(5) # wait for model to load - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(savedmodel_name in ss.model_status, - "expected status for model " + savedmodel_name) - for (k, v) in iteritems(ss.model_status[savedmodel_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - self.assertEqual(v.model_execution_count, 0) - except InferenceServerException as ex: + time.sleep(5) # wait for model to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + stats = triton_client.get_inference_statistics(savedmodel_name) + self.assertEqual(len(stats["model_stats"]), 2) + self.assertEqual(stats["model_stats"][0]["name"], savedmodel_name) + self.assertEqual(stats["model_stats"][1]["name"], savedmodel_name) + self.assertEqual( + stats["model_stats"][0]["inference_stats"]["success"]["count"], 0 + ) + self.assertEqual( + stats["model_stats"][1]["inference_stats"]["success"]["count"], 0 + ) + + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + stats = triton_client.get_inference_statistics(savedmodel_name) + self.assertEqual(len(stats.model_stats), 2) + self.assertEqual(stats.model_stats[0].name, savedmodel_name) + self.assertEqual(stats.model_stats[1].name, savedmodel_name) + self.assertEqual(stats.model_stats[0].inference_stats.success.count, 0) + self.assertEqual(stats.model_stats[1].inference_stats.success.count, 0) + + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Remove original model from the model store and give it time - # to unload. Make sure that it has a status but is - # unavailable. + # Remove onnx model from the model repository and give it + # time to unload. Make sure that it is unavailable. 
try: - shutil.rmtree("models/" + netdef_name) - time.sleep(5) # wait for model to unload - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], netdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(netdef_name in ss.model_status, - "expected status for model " + netdef_name) - self.assertTrue(3 in ss.model_status[netdef_name].version_status, - "expected status for version 3 of model " + netdef_name) - - version_status = ss.model_status[netdef_name].version_status[3] - self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) - except InferenceServerException as ex: + shutil.rmtree("models/" + onnx_name) + time.sleep(5) # wait for model to unload + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertFalse(triton_client.is_model_ready(onnx_name, "1")) + self.assertFalse(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Model is removed so inference should fail try: - iu.infer_exact(self, 'netdef', tensor_shape, 1, True, - np.float32, np.float32, np.float32, swap=True) - self.assertTrue(False, "expected error for unavailable model " + netdef_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith( - "Inference request for unknown model 'netdef_float32_float32_float32'")) + iu.infer_exact( + self, + "onnx", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + ) + self.assertTrue(False, "expected error for unavailable model " + onnx_name) + except Exception as ex: + self.assertIn( + "Request for unknown model: 'onnx_float32_float32_float32' has no available versions", + ex.message(), + ) def test_dynamic_model_load_unload_disabled(self): - input_size = 16 - tensor_shape = (input_size,) - savedmodel_name = tu.get_model_name('savedmodel', np.float32, np.float32, np.float32) - netdef_name = tu.get_model_name('netdef', np.float32, np.float32, np.float32) + tensor_shape = (1, 16) + savedmodel_name = tu.get_model_name( + "savedmodel", np.float32, np.float32, np.float32 + ) + onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32) # Make sure savedmodel model is not in the status (because - # initially it is not in the model store) - try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertTrue(False, "expected status failure for " + savedmodel_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("no status available for unknown model")) + # initially it is not in the model 
repository) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) - # Add savedmodel model to the model store and give it time to + # Add savedmodel model to the model repository and give it time to # load. But it shouldn't load because dynamic loading is disabled. try: shutil.copytree(savedmodel_name, "models/" + savedmodel_name) - time.sleep(5) # wait for model to load - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], savedmodel_name, True) - ss = ctx.get_server_status() - self.assertTrue(False, "expected status failure for " + savedmodel_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) - self.assertTrue( - ex.message().startswith("no status available for unknown model")) + time.sleep(5) # wait for model to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference which should fail because the model isn't there try: - iu.infer_exact(self, 'savedmodel', tensor_shape, 1, True, - np.float32, np.float32, np.float32, swap=True) - self.assertTrue(False, "expected error for unavailable model " + savedmodel_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) + iu.infer_exact( + self, + "savedmodel", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + ) self.assertTrue( - ex.message().startswith("no status available for unknown model")) - - # Remove one of the original models from the model - # store. Unloading is disabled so it should remain available - # in the status. + False, "expected error for unavailable model " + savedmodel_name + ) + except Exception as ex: + self.assertIn( + "Request for unknown model: 'savedmodel_float32_float32_float32' is not found", + ex.message(), + ) + + # Remove one of the original models from the model repository. + # Unloading is disabled so it should remain available in the status. 
try: - shutil.rmtree("models/" + netdef_name) - time.sleep(5) # wait for model to unload (but it shouldn't) - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], netdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(netdef_name in ss.model_status, - "expected status for model " + netdef_name) - self.assertTrue(3 in ss.model_status[netdef_name].version_status, - "expected status for version 3 of model " + netdef_name) - - version_status = ss.model_status[netdef_name].version_status[3] - self.assertEqual(version_status.ready_state, server_status.MODEL_READY) - - except InferenceServerException as ex: + shutil.rmtree("models/" + onnx_name) + time.sleep(5) # wait for model to unload (but it shouldn't) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference to make sure model still being served even - # though deleted from model store + # though deleted from model repository try: - iu.infer_exact(self, 'netdef', tensor_shape, 1, True, - np.float32, np.float32, np.float32, swap=True) - except InferenceServerException as ex: + iu.infer_exact( + self, + "onnx", + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + ) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_dynamic_version_load_unload(self): - input_size = 16 - tensor_shape = (input_size,) - graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32, np.int32) + tensor_shape = (1, 16) + graphdef_name = tu.get_model_name("graphdef", np.int32, np.int32, np.int32) # There are 3 versions. Make sure that all have status and are # ready. 
try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3) - for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - except InferenceServerException as ex: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "1")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "2")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference on version 1 to make sure it is available try: - iu.infer_exact(self, 'graphdef', tensor_shape, 1, True, - np.int32, np.int32, np.int32, swap=False, - model_version=1) - except InferenceServerException as ex: + iu.infer_exact( + self, + "graphdef", + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + swap=False, + model_version=1, + ) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Make sure version 1 has execution stats in the status. - expected_exec_cnt = 0 + # Make sure only version 1 has execution stats in the status. 
try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertTrue(1 in ss.model_status[graphdef_name].version_status, - "expected status for version 1 of model " + graphdef_name) - - version_status = ss.model_status[graphdef_name].version_status[1] - self.assertEqual(version_status.ready_state, server_status.MODEL_READY) - self.assertGreater(version_status.model_execution_count, 0) - expected_exec_cnt = version_status.model_execution_count - except InferenceServerException as ex: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + stats = triton_client.get_inference_statistics(graphdef_name) + self.assertEqual(len(stats["model_stats"]), 3) + for idx in range(len(stats["model_stats"])): + self.assertEqual(stats["model_stats"][idx]["name"], graphdef_name) + if stats["model_stats"][idx]["version"] == "1": + self.assertNotEqual( + stats["model_stats"][idx]["inference_stats"]["success"][ + "count" + ], + 0, + ) + else: + self.assertEqual( + stats["model_stats"][idx]["inference_stats"]["success"][ + "count" + ], + 0, + ) + + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + stats = triton_client.get_inference_statistics(graphdef_name) + self.assertEqual(len(stats.model_stats), 3) + for idx in range(len(stats.model_stats)): + self.assertEqual(stats.model_stats[idx].name, graphdef_name) + if stats.model_stats[idx].version == "1": + self.assertNotEqual( + stats.model_stats[idx].inference_stats.success.count, 0 + ) + else: + self.assertEqual( + stats.model_stats[idx].inference_stats.success.count, 0 + ) + + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Remove version 1 from the model store and give it time to - # unload. Make sure that it has a status but is unavailable. + # Remove version 1 from the model repository and give it time to + # unload. Make sure that it is unavailable. 
try: shutil.rmtree("models/" + graphdef_name + "/1") - time.sleep(5) # wait for version to unload - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertTrue(1 in ss.model_status[graphdef_name].version_status, - "expected status for version 1 of model " + graphdef_name) - - version_status = ss.model_status[graphdef_name].version_status[1] - self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) - self.assertEqual(version_status.model_execution_count, expected_exec_cnt) - except InferenceServerException as ex: + time.sleep(5) # wait for version to unload + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(graphdef_name, "1")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "2")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Version is removed so inference should fail try: - iu.infer_exact(self, 'graphdef', tensor_shape, 1, True, - np.int32, np.int32, np.int32, swap=False, - model_version=1) - self.assertTrue(False, "expected error for unavailable model " + graphdef_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) + iu.infer_exact( + self, + "graphdef", + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + swap=False, + model_version=1, + ) self.assertTrue( - ex.message().startswith( - "Inference request for unknown model 'graphdef_int32_int32_int32'")) - - # Add back the same version. The status/stats should be - # retained for versions (note that this is different behavior - # than if a model is removed and then added back). 
- try: - shutil.copytree("models/" + graphdef_name + "/2", - "models/" + graphdef_name + "/1") - time.sleep(5) # wait for model to load - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3) - for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - if k == 1: - self.assertEqual(v.model_execution_count, expected_exec_cnt) - else: - self.assertEqual(v.model_execution_count, 0) - except InferenceServerException as ex: - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Add another version from the model store. + False, "expected error for unavailable model " + graphdef_name + ) + except Exception as ex: + self.assertIn( + "Request for unknown model: 'graphdef_int32_int32_int32' version 1 is not at ready state", + ex.message(), + ) + + # Add another version to the model repository. try: - shutil.copytree("models/" + graphdef_name + "/2", - "models/" + graphdef_name + "/7") - time.sleep(5) # wait for version to load - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertTrue(7 in ss.model_status[graphdef_name].version_status, - "expected status for version 7 of model " + graphdef_name) - - self.assertEqual(len(ss.model_status[graphdef_name].version_status), 4) - for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - except InferenceServerException as ex: + shutil.copytree( + "models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7" + ) + time.sleep(5) # wait for version to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(graphdef_name, "1")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "2")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "3")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "7")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_dynamic_version_load_unload_disabled(self): - input_size = 16 - tensor_shape = (input_size,) - graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32, np.int32) + tensor_shape = (1, 16) + graphdef_name = tu.get_model_name("graphdef", np.int32, np.int32, np.int32) - # Add a new version to the model store and give it time to + # Add a 
new version to the model repository and give it time to # load. But it shouldn't load because dynamic loading is # disabled. try: - shutil.copytree("models/" + graphdef_name + "/2", - "models/" + graphdef_name + "/7") - time.sleep(5) # wait for model to load - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertFalse(7 in ss.model_status[graphdef_name].version_status, - "unexpected status for version 7 of model " + graphdef_name) - self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3) - except InferenceServerException as ex: + shutil.copytree( + "models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7" + ) + time.sleep(5) # wait for model to load + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "1")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "2")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "3")) + self.assertFalse(triton_client.is_model_ready(graphdef_name, "7")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Remove one of the original versions from the model - # store. Unloading is disabled so it should remain available + # Remove one of the original versions from the model repository. + # Unloading is disabled so it should remain available # in the status. 
try: shutil.rmtree("models/" + graphdef_name + "/1") - time.sleep(5) # wait for version to unload (but it shouldn't) - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], graphdef_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(graphdef_name in ss.model_status, - "expected status for model " + graphdef_name) - self.assertTrue(1 in ss.model_status[graphdef_name].version_status, - "expected status for version 1 of model " + graphdef_name) - - self.assertEqual(len(ss.model_status[graphdef_name].version_status), 3) - for (k, v) in iteritems(ss.model_status[graphdef_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - except InferenceServerException as ex: + time.sleep(5) # wait for version to unload (but it shouldn't) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "1")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "2")) + self.assertTrue(triton_client.is_model_ready(graphdef_name, "3")) + self.assertFalse(triton_client.is_model_ready(graphdef_name, "7")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference to make sure model still being served even - # though version deleted from model store + # though version deleted from model repository try: - iu.infer_exact(self, 'graphdef', tensor_shape, 1, True, - np.int32, np.int32, np.int32, swap=False, - model_version=1) - except InferenceServerException as ex: + iu.infer_exact( + self, + "graphdef", + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + swap=False, + model_version=1, + ) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_dynamic_model_modify(self): - input_size = 16 - models_base = ('savedmodel', 'plan') - models_shape = ((input_size,), (input_size, 1, 1)) + models_base = ("savedmodel", "plan") + models_shape = ((1, 16), (1, 16)) models = list() for m in models_base: models.append(tu.get_model_name(m, np.float32, np.float32, np.float32)) @@ -584,81 +1052,2574 @@ def test_dynamic_model_modify(self): # Make sure savedmodel and plan are in the status for model_name in models: try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], model_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(model_name in ss.model_status, - "expected status for model " + model_name) - for (k, v) in iteritems(ss.model_status[model_name].version_status): - self.assertEqual(v.ready_state, server_status.MODEL_READY) - except InferenceServerException as ex: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + 
self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Run inference on the model, both versions 1 and 3 for version in (1, 3): for model_name, model_shape in zip(models_base, models_shape): try: - iu.infer_exact(self, model_name, model_shape, 1, True, - np.float32, np.float32, np.float32, swap=(version == 3), - model_version=version) - except InferenceServerException as ex: + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=(version == 3), + model_version=version, + ) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - # Change the model configuration to have the default version - # policy (so that only version 3) if available. + # Change the model configuration to use wrong label file for base_name, model_name in zip(models_base, models): - shutil.copyfile("config.pbtxt." + base_name, "models/" + model_name + "/config.pbtxt") + shutil.copyfile( + "config.pbtxt.wrong." + base_name, + "models/" + model_name + "/config.pbtxt", + ) - time.sleep(5) # wait for models to reload + time.sleep(5) # wait for models to reload + for model_name in models: + for model_name, model_shape in zip(models_base, models_shape): + try: + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=(version == 3), + model_version=version, + output0_raw=False, + ) + self.assertTrue( + False, "expected error for wrong label for " + model_name + ) + except AssertionError as ex: + self.assertTrue("'label9" in str(ex) and "!=" in str(ex), str(ex)) + + # Change the model configuration to use correct label file and to have + # the default version policy (so that only version 3) is available. + for base_name, model_name in zip(models_base, models): + shutil.copyfile( + "config.pbtxt." 
+ base_name, "models/" + model_name + "/config.pbtxt" + ) + + time.sleep(5) # wait for models to reload for model_name in models: try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - ctx = ServerStatusContext(pair[0], pair[1], model_name, True) - ss = ctx.get_server_status() - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], ss.version) - self.assertEqual("inference:0", ss.id) - self.assertEqual(server_status.SERVER_READY, ss.ready_state) - self.assertEqual(len(ss.model_status), 1) - self.assertTrue(model_name in ss.model_status, - "expected status for model " + model_name) - self.assertTrue(1 in ss.model_status[model_name].version_status, - "expected status for version 1 of model " + model_name) - self.assertTrue(3 in ss.model_status[model_name].version_status, - "expected status for version 3 of model " + model_name) - self.assertEqual(ss.model_status[model_name].version_status[1].ready_state, - server_status.MODEL_UNAVAILABLE) - self.assertEqual(ss.model_status[model_name].version_status[3].ready_state, - server_status.MODEL_READY) - except InferenceServerException as ex: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Attempt inferencing using version 1, should fail since # change in model policy makes that no longer available. for model_name, model_shape in zip(models_base, models_shape): try: - iu.infer_exact(self, model_name, model_shape, 1, True, - np.float32, np.float32, np.float32, swap=False, - model_version=1) - self.assertTrue(False, "expected error for unavailable model " + model_name) - except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=False, + model_version=1, + ) self.assertTrue( - ex.message().startswith("Inference request for unknown model")) + False, "expected error for unavailable model " + model_name + ) + except Exception as ex: + self.assertIn("Request for unknown model", ex.message()) # Version 3 should continue to work... 
for model_name, model_shape in zip(models_base, models_shape): try: - iu.infer_exact(self, model_name, model_shape, 1, True, - np.float32, np.float32, np.float32, swap=True, - model_version=3) - except InferenceServerException as ex: + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + model_version=3, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_dynamic_file_delete(self): + models_base = ("savedmodel", "plan") + models_shape = ((1, 16), (1, 16)) + models = list() + for m in models_base: + models.append(tu.get_model_name(m, np.float32, np.float32, np.float32)) + + # Make sure savedmodel and plan are in the status + for model_name in models: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Run inference on the model, both versions 1 and 3 + for version in (1, 3): + for model_name, model_shape in zip(models_base, models_shape): + try: + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=(version == 3), + model_version=version, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Delete model configuration, which cause model to be + # re-loaded and use autofilled config, which means that + # version policy will be latest and so only version 3 will be + # available + for model_name in models: + os.remove("models/" + model_name + "/config.pbtxt") + + time.sleep(5) # wait for models to reload + for model_name in models: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Only version 3 (latest) should work... + for model_name, model_shape in zip(models_base, models_shape): + try: + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=True, + model_version=3, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + try: + iu.infer_exact( + self, + model_name, + model_shape, + 1, + np.float32, + np.float32, + np.float32, + swap=False, + model_version=1, + ) + self.assertTrue( + False, "expected error for unavailable model " + model_name + ) + except Exception as ex: + self.assertIn("Request for unknown model", ex.message()) + + def test_multiple_model_repository_polling(self): + model_shape = (1, 16) + savedmodel_name = tu.get_model_name( + "savedmodel", np.float32, np.float32, np.float32 + ) + + # Models should be loaded successfully and infer + # successfully. Initially savedmodel only has version 1. 
+ self._infer_success_models( + [ + "savedmodel", + ], + (1,), + model_shape, + ) + self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape) + + # Add the savedmodel to the second model repository, should cause + # it to be unloaded due to duplication + shutil.copytree(savedmodel_name, "models_0/" + savedmodel_name) + time.sleep(5) # wait for models to reload + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1")) + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape) + + # Remove the savedmodel from the first model repository, the + # model from the second model repository should be loaded + # properly. In the second model repository savedmodel should + # have versions 1 and 3. + shutil.rmtree("models/" + savedmodel_name) + time.sleep(5) # wait for model to unload + self._infer_success_models( + ["savedmodel", "graphdef", "onnx"], (1, 3), model_shape + ) + + def test_multiple_model_repository_control(self): + # similar to test_multiple_model_repository_polling, but the + # model load/unload is controlled by the API + model_shape = (1, 16) + savedmodel_name = tu.get_model_name( + "savedmodel", np.float32, np.float32, np.float32 + ) + model_bases = ["savedmodel", "graphdef", "onnx"] + + # Initially models are not loaded + for base in model_bases: + try: + model_name = tu.get_model_name(base, np.float32, np.float32, np.float32) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Load all models, here we use GRPC + for base in model_bases: + try: + model_name = tu.get_model_name(base, np.float32, np.float32, np.float32) + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + triton_client.load_model(model_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Models should be loaded successfully and infer + # successfully. Initially savedmodel only has version 1. + self._infer_success_models( + [ + "savedmodel", + ], + (1,), + model_shape, + ) + self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape) + + # Add the savedmodel to the second model repository. Because + # not polling this doesn't change any model state, all models + # are still loaded and available. + shutil.copytree(savedmodel_name, "models_0/" + savedmodel_name) + self._infer_success_models( + [ + "savedmodel", + ], + (1,), + model_shape, + ) + self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape) + + # Load savedmodel again which should fail because it is now duplicated + # in 2 model repositories. Use HTTP here. 
+ try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(savedmodel_name) + except Exception as ex: + self.assertIn("failed to load '{}'".format(savedmodel_name), ex.message()) + + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + # Unlike polling mode, the failed load on the duplicate model + # should NOT unload the existing versions in model control mode. + self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1")) + # Version 3 did not exist in the first model repository, so + # it should still not be loaded. + self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape) + + # Remove the savedmodel from the first model repository and + # explicitly load savedmodel. The savedmodel from the second + # model repository should be loaded properly. In the second + # model repository savedmodel should have versions 1 and 3. + shutil.rmtree("models/" + savedmodel_name) + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + # Unload existing in-memory model from first model repository + triton_client.unload_model(savedmodel_name) + # Load model from second model repository since original was deleted + triton_client.load_model(savedmodel_name) + except Exception as ex: + self.assertIn("failed to load '{}'".format(savedmodel_name), ex.message()) + + self._infer_success_models( + ["savedmodel", "graphdef", "onnx"], (1, 3), model_shape + ) + + def test_model_control(self): + model_shape = (1, 16) + onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32) + + ensemble_prefix = "simple_" + ensemble_name = ensemble_prefix + onnx_name + + # Make sure no models are loaded + for model_name in (onnx_name, ensemble_name): + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Load non-existent model + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + triton_client.load_model("unknown_model") + self.assertTrue(False, "expected unknown model failure") + except Exception as ex: + self.assertIn( + "failed to load 'unknown_model', failed to poll from model repository", + ex.message(), + ) + + # Load ensemble model, the dependent model should be polled and loaded + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(ensemble_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_onnx", + ], + (1, 3), + model_shape, + 
swap=True, + ) + + # Delete model configuration for onnx, which will cause + # the autofiller to use the latest version policy so that only + # version 3 will be available if the models are re-loaded + for model_name in (onnx_name,): + os.remove("models/" + model_name + "/config.pbtxt") + + self._infer_success_models( + [ + "onnx", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_onnx", + ], + (1, 3), + model_shape, + swap=True, + ) + + # Reload models, only version 3 should be available for onnx + for model_name in (onnx_name, ensemble_name): + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + triton_client.load_model(model_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (3,), + model_shape, + ) + self._infer_success_models( + [ + "simple_onnx", + ], + (1, 3), + model_shape, + swap=True, + ) + + for model_name in (onnx_name,): + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Unload non-existing model, nothing should happen + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + triton_client.unload_model("unknown_model") + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Unload the depending model, as side effect, the ensemble model will be + # forced to be unloaded + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.unload_model(onnx_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + for model_name in (onnx_name, ensemble_name): + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Explicitly unload the ensemble and load the depending + # model. The ensemble model should not be reloaded because it + # was explicitly unloaded. 
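+ # Hedged sketch of the expected end state, based on the checks below:
+ # the onnx model serves only version 3 again, while no version of the
+ # ensemble reports ready, e.g.:
+ #   triton_client.is_model_ready(onnx_name, "3")      -> True
+ #   triton_client.is_model_ready(ensemble_name, "1")  -> False
+ #   triton_client.is_model_ready(ensemble_name, "3")  -> False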
+ try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.unload_model(ensemble_name) + triton_client.load_model(onnx_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (3,), + model_shape, + ) + + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(ensemble_name, "1")) + self.assertFalse(triton_client.is_model_ready(ensemble_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_model_control_fail(self): + model_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32) + + # Make sure no models are loaded + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Request to load the model and expect fail to load + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(model_name) + self.assertTrue(False, "expecting load failure") + except InferenceServerException as ex: + self.assertIn("load failed for model '{}'".format(model_name), ex.message()) + + # Another attempt should fail as well + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(model_name) + self.assertTrue(False, "expecting load failure") + except InferenceServerException as ex: + self.assertIn("load failed for model '{}'".format(model_name), ex.message()) + + def test_model_control_ensemble(self): + model_shape = (1, 16) + onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32) + + ensemble_prefix = "simple_" + ensemble_name = ensemble_prefix + onnx_name + + # Make sure no models are loaded + for model_name in (onnx_name, ensemble_name): + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Load ensemble model, the dependent model should be polled and loaded + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(ensemble_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_onnx", + ], + (1, 3), + model_shape, + swap=True, + ) + + # Unload the ensemble with unload_dependents flag. 
all models should be unloaded + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.unload_model(ensemble_name, unload_dependents=True) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + for model_name in (onnx_name, ensemble_name): + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Load ensemble model, and unload it without unload_dependents flag (default). + # The dependent model should still be available + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(ensemble_name) + triton_client.unload_model(ensemble_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (1, 3), + model_shape, + ) + + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(ensemble_name, "1")) + self.assertFalse(triton_client.is_model_ready(ensemble_name, "3")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "1")) + self.assertTrue(triton_client.is_model_ready(onnx_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_load_same_model_different_platform(self): + model_shape = (1, 16) + model_name = tu.get_model_name("simple", np.float32, np.float32, np.float32) + + # Check whether or not to use grpc protocol + use_grpc = "TRITONSERVER_USE_GRPC" in os.environ + + # Make sure version 1 and 3 of the model are loaded + # and the model platform is TensorRT + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + if use_grpc: + metadata = triton_client.get_model_metadata(model_name, as_json=True) + else: + metadata = triton_client.get_model_metadata(model_name) + self.assertEqual(metadata["platform"], "tensorrt_plan") + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_models( + [ + "simple", + ], + ( + 1, + 3, + ), + model_shape, + ) + + # Copy the same model of different platform to model repository + shutil.rmtree("models/" + model_name) + shutil.copytree(model_name, "models/" + model_name) + + # Reload models + try: + triton_client = self._get_client(use_grpc) + triton_client.load_model(model_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Make sure version 1 and 3 of the model are loaded + # and the model platform is PyTorch + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + 
self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + if use_grpc: + metadata = triton_client.get_model_metadata(model_name, as_json=True) + else: + metadata = triton_client.get_model_metadata(model_name) + self.assertEqual(metadata["platform"], "pytorch_libtorch") + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_models( + [ + "simple", + ], + ( + 1, + 3, + ), + model_shape, + ) + + def test_model_availability_on_reload(self): + model_name = "identity_zero_1_int32" + model_base = "identity" + model_shape = (16,) + + # Check whether or not to use grpc protocol + use_grpc = "TRITONSERVER_USE_GRPC" in os.environ + + # Make sure version 1 of the model is loaded + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + # Create a new version for reload + os.mkdir("models/" + model_name + "/2") + + # Reload models, v1 should still be available until v2 is loaded + # The load is requested in other thread as it is blocking API, + # and the v1 availability should be tested during the reload + thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc)) + thread.start() + # wait for time < model creation delay to ensure load request is sent + time.sleep(3) + load_start = time.time() + + # Make sure version 1 of the model is still available + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + load_end = time.time() + self.assertTrue( + (load_end - load_start) < 5, + "server was waiting unexpectedly, waited {}".format( + (load_end - load_start) + ), + ) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + thread.join() + # Make sure version 2 of the model is available while version 1 is not + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "2")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (2,), np.int32, model_shape) + + def test_model_availability_on_reload_2(self): + model_name = "identity_zero_1_int32" + model_base = "identity" + model_shape = (16,) + + # Check whether or not to use grpc protocol + use_grpc = "TRITONSERVER_USE_GRPC" in os.environ + + # Make sure version 1 of the model is loaded + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + # Overwrite config.pbtxt to load v2 only + shutil.copyfile("config.pbtxt.v2", "models/" + 
model_name + "/config.pbtxt") + + # Reload models, v1 should still be available until v2 is loaded + # The load is requested in other thread as it is blocking API, + # and the v1 availability should be tested during the reload + thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc)) + thread.start() + # wait for time < model creation delay to ensure load request is sent + time.sleep(3) + load_start = time.time() + + # Make sure version 1 of the model is still available + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + load_end = time.time() + self.assertTrue( + (load_end - load_start) < 5, + "server was waiting unexpectedly, waited {}".format( + (load_end - load_start) + ), + ) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + thread.join() + # Make sure version 2 of the model is available while version 1 is not + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "2")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (2,), np.int32, model_shape) + + def test_model_availability_on_reload_3(self): + model_name = "identity_zero_1_int32" + model_base = "identity" + model_shape = (16,) + + # Check whether or not to use grpc protocol + use_grpc = "TRITONSERVER_USE_GRPC" in os.environ + + # Make sure version 1 of the model is loaded + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + # Overwrite config.pbtxt to load v2 only + shutil.copyfile("config.pbtxt.new", "models/" + model_name + "/config.pbtxt") + + # Reload models, v1 will be reloaded but it should be available + # during the whole reload + thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc)) + thread.start() + # wait for time < model creation delay to ensure load request is sent + time.sleep(3) + load_start = time.time() + + # Make sure version 1 of the model is still available + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + load_end = time.time() + self.assertTrue( + (load_end - load_start) < 5, + "server was waiting unexpectedly, waited {}".format( + (load_end - load_start) + ), + ) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + thread.join() + # Make sure version 1 of the model is still available after reload + try: + triton_client = self._get_client(use_grpc) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + 
self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + def test_model_reload_fail(self): + model_name = "identity_zero_1_int32" + model_base = "identity" + model_shape = (16,) + + # Make sure version 1 of the model is loaded + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + # Overwrite config.pbtxt to load v2 only on GPU, which will fail + shutil.copyfile("config.pbtxt.v2.gpu", "models/" + model_name + "/config.pbtxt") + + # Reload models, v1 should still be available even if v2 fails to load + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(model_name) + self.assertTrue(False, "expecting load failure") + except Exception as ex: + self.assertIn( + "version 2 is at UNAVAILABLE state: Internal: GPU instances not supported", + ex.message(), + ) + + # Make sure version 1 of the model is available, and version 2 is not + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self._infer_success_identity(model_base, (1,), np.int32, model_shape) + + def test_multiple_model_repository_control_startup_models(self): + model_shape = (1, 16) + onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32) + plan_name = tu.get_model_name("plan", np.float32, np.float32, np.float32) + + ensemble_prefix = "simple_" + onnx_ensemble_name = ensemble_prefix + onnx_name + plan_ensemble_name = ensemble_prefix + plan_name + + # Make sure unloaded models are not in the status + for base in ("savedmodel",): + model_name = tu.get_model_name(base, np.float32, np.float32, np.float32) + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # And loaded models work properly + self._infer_success_models( + [ + "onnx", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_onnx", + ], + (1, 3), + model_shape, + swap=True, + ) + self._infer_success_models( + [ + "plan", + ], + (1, 3), + model_shape, + ) + + # Load non-existing model + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + triton_client.load_model("unknown_model") + self.assertTrue(False, "expected unknown model failure") 
+ except Exception as ex: + self.assertIn( + "failed to load 'unknown_model', failed to poll from model repository", + ex.message(), + ) + + # Load plan ensemble model, the dependent model is already + # loaded via command-line + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.load_model(plan_ensemble_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "plan", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_plan", + ], + (1, 3), + model_shape, + swap=True, + ) + + # Delete model configuration, which will cause the autofiller + # to use the latest version policy so that only version 3 will + # be available if the models are re-loaded + os.remove("models/" + onnx_name + "/config.pbtxt") + + self._infer_success_models( + [ + "plan", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_plan", + ], + (1, 3), + model_shape, + swap=True, + ) + + # Reload onnx, only version 3 should be available + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + triton_client.load_model(onnx_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (3,), + model_shape, + ) + self._infer_success_models( + [ + "simple_onnx", + ], + (1, 3), + model_shape, + swap=True, + ) + + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(onnx_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Unload non-existing model, nothing should happen + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + triton_client.unload_model("unknown_model") + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Unload the onnx, as side effect, the ensemble model + # will be forced to be unloaded + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.unload_model(onnx_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + for model_name in [onnx_name, onnx_ensemble_name]: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Explicitly unload the onnx ensemble and load the + # depending model. The ensemble model should not be reloaded + # because it was explicitly unloaded. 
+ try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + triton_client.unload_model(onnx_ensemble_name) + triton_client.load_model(onnx_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + self._infer_success_models( + [ + "onnx", + ], + (3,), + model_shape, + ) + self._infer_success_models( + [ + "plan", + ], + (1, 3), + model_shape, + ) + self._infer_success_models( + [ + "simple_plan", + ], + (1, 3), + model_shape, + swap=True, + ) + + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(onnx_ensemble_name, "1")) + self.assertFalse(triton_client.is_model_ready(onnx_ensemble_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_model_repository_index(self): + # use model control EXPLICIT and --load-model to load a subset of models + # in model repository + tensor_shape = (1, 16) + model_bases = ["graphdef", "savedmodel", "simple_savedmodel"] + + # Sanity check on loaded models + # 3 models should be loaded: + # simple_savedmodel_float32_float32_float32 + # savedmodel_float32_float32_float32 + # graphdef_float32_float32_float32 + for model_base in model_bases: + try: + model_name = tu.get_model_name( + model_base, np.float32, np.float32, np.float32 + ) + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Check model repository index + # All models should be in ready state except onnx_float32_float32_float32 + # which appears in two repositories. 
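+ # Hedged sketch of a single index entry as consumed by the HTTP client below
+ # (field names taken from the assertions; other fields may also be present,
+ # and the GRPC client returns an object with .models instead of a list of dicts):
+ #   {"name": "onnx_float32_float32_float32",
+ #    "state": "UNAVAILABLE",
+ #    "reason": "model appears in two or more repositories"}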
+ model_bases.append("simple_graphdef") + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + index = triton_client.get_model_repository_index() + indexed = list() + self.assertEqual(len(index), 8) + for i in index: + indexed.append(i["name"]) + if i["name"] == "onnx_float32_float32_float32": + self.assertEqual(i["state"], "UNAVAILABLE") + self.assertEqual( + i["reason"], "model appears in two or more repositories" + ) + for model_base in model_bases: + model_name = tu.get_model_name( + model_base, np.float32, np.float32, np.float32 + ) + self.assertTrue(model_name in indexed) + + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + index = triton_client.get_model_repository_index() + indexed = list() + self.assertEqual(len(index.models), 8) + for i in index.models: + indexed.append(i.name) + if i.name == "onnx_float32_float32_float32": + self.assertEqual(i.state, "UNAVAILABLE") + self.assertEqual( + i.reason, "model appears in two or more repositories" + ) + for model_base in model_bases: + model_name = tu.get_model_name( + model_base, np.float32, np.float32, np.float32 + ) + self.assertTrue(model_name in indexed) + + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_config_override(self): + model_shape = (1, 16) + + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + for base in (("onnx", "onnxruntime"),): + model_name = tu.get_model_name( + base[0], np.float32, np.float32, np.float32 + ) + try: + self.assertTrue(triton_client.is_server_live()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Request to load the model as is and expect the model fails + # to load with default config + try: + triton_client.load_model(model_name) + self.assertTrue( + False, "expected fail to load '{}'".format(model_name) + ) + except Exception as ex: + self.assertIn( + "load failed for model '{}'".format(model_name), ex.message() + ) + + # Request to load the model with provided "correct" config + try: + triton_client.load_model( + model_name, + config=""" +{{"backend":"{backend}","version_policy":{{"specific" : {{ "versions": [2] }} }} }} +""".format( + backend=base[1] + ), + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + + # And loaded models work properly + self._infer_success_models( + [ + base[0], + ], + (2,), + model_shape, + ) + + # request without additional config will load retain the provided + # config and expect to not fail, and version 2 will not be loaded. 
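+ # (As read from the assertions below: the override config provided above is
+ # retained, the load is expected to succeed, and version 2 remains the only
+ # ready version.)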
+ try: + triton_client.load_model(model_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + + # Unload model for the next client iteration + try: + triton_client.unload_model(model_name) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_file_override(self): + model_shape = (1, 16) + override_base = "override_model" + + for base in (("onnx", "onnxruntime"),): + model_name = tu.get_model_name(base[0], np.float32, np.float32, np.float32) + override_model_name = tu.get_model_name( + override_base, np.float32, np.float32, np.float32 + ) + + # Prepare override file + with open("models/{}/3/model.{}".format(model_name, base[0]), "rb") as f: + file_content = f.read() + + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + try: + self.assertTrue(triton_client.is_server_live()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Request to load the model with override file, should fail + # without providing override config. The config requirement + # serves as an reminder that the existing model directory will + # not be used. 
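+ # The files mapping used below appears to take keys of the form
+ # "file:<version>/<file name>" with the file content as the value; this is
+ # inferred from the calls in this test rather than a complete description
+ # of the load API.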
+ try: + triton_client.load_model( + model_name, files={"file:1/model.onnx": file_content} + ) + self.assertTrue(False, "expected error on missing override config") + except InferenceServerException as ex: + # [FIXME] Improve error reporting to mention missing config + self.assertIn( + "failed to load '{}', failed to poll from model repository".format( + model_name + ), + ex.message(), + ) + + # Sanity check on previous loaded version is still available + # after the failure attempt to load model with different version + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + + self._infer_success_models( + [ + base[0], + ], + (3,), + model_shape, + ) + + # Request to load the model with override file and config in + # a different name + try: + triton_client.load_model( + override_model_name, + config="""{{"backend":"{backend}" }}""".format(backend=base[1]), + files={"file:1/model.onnx": file_content}, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Sanity check on previous loaded version is still available + # after the load with different model name + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + self._infer_success_models( + [ + base[0], + ], + (3,), + model_shape, + ) + + # New override model should also be available + self.assertTrue(triton_client.is_model_ready(override_model_name, "1")) + self.assertFalse(triton_client.is_model_ready(override_model_name, "2")) + self.assertFalse(triton_client.is_model_ready(override_model_name, "3")) + self._infer_success_models( + [ + override_base, + ], + (1,), + model_shape, + swap=True, + ) + + # Request to load the model with override file and config in + # original name + try: + triton_client.load_model( + model_name, + config="""{{"backend":"{backend}" }}""".format(backend=base[1]), + files={"file:1/model.onnx": file_content}, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # The model should be loaded from the override model directory + # which has different model version + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + self._infer_success_models( + [ + base[0], + ], + (1,), + model_shape, + swap=True, + ) + + # The model with different name should be available + self.assertTrue(triton_client.is_model_ready(override_model_name, "1")) + self.assertFalse(triton_client.is_model_ready(override_model_name, "2")) + self.assertFalse(triton_client.is_model_ready(override_model_name, "3")) + self._infer_success_models( + [ + override_base, + ], + (1,), + model_shape, + swap=True, + ) + + # Reset model for the next client iteration + try: + # Unload and load the model again and the original model repository will + # be used + triton_client.unload_model(model_name) + triton_client.load_model(model_name) + triton_client.unload_model(override_model_name) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) 
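+ # With the original model repository back in effect, only version 3 is
+ # expected to serve the inference below.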
+ self._infer_success_models( + [ + base[0], + ], + (3,), + model_shape, + ) + + # Test that model load API file override can't be used to create files + # outside of any model directory. + def test_file_override_security(self): + # When using model load API, temporary model directories are created in + # a randomly generated /tmp/folderXXXXXX directory for the life of the + # model, and cleaned up on model unload. + model_basepath = "/tmp/folderXXXXXX" + if os.path.exists(model_basepath) and os.path.isdir(model_basepath): + shutil.rmtree(model_basepath) + os.makedirs(model_basepath) + + # Set file override paths that try to escape out of model directory, + # and test both pre-existing and non-existent files. + root_home_dir = "/root" + + # Relative paths + escape_dir_rel = os.path.join("..", "..", "root") + escape_dir_full = os.path.join(model_basepath, escape_dir_rel) + self.assertEqual(os.path.abspath(escape_dir_full), root_home_dir) + + new_file_rel = os.path.join(escape_dir_rel, "new_dir", "test.txt") + self.assertFalse(os.path.exists(os.path.join(model_basepath, new_file_rel))) + existing_file_rel = os.path.join(escape_dir_rel, ".bashrc") + self.assertTrue(os.path.exists(os.path.join(model_basepath, existing_file_rel))) + + # Symlinks + ## No easy way to inject symlink into generated temp model dir, so for + ## testing sake, make a fixed symlink path in /tmp. + escape_dir_symlink_rel = os.path.join("..", "escape_symlink") + escape_dir_symlink_full = "/tmp/escape_symlink" + self.assertEqual( + os.path.abspath(os.path.join(model_basepath, escape_dir_symlink_rel)), + escape_dir_symlink_full, + ) + if os.path.exists(escape_dir_symlink_full): + os.unlink(escape_dir_symlink_full) + os.symlink(root_home_dir, escape_dir_symlink_full) + self.assertTrue(os.path.abspath(escape_dir_symlink_full), root_home_dir) + + symlink_new_file_rel = os.path.join( + escape_dir_symlink_rel, "new_dir", "test.txt" + ) + self.assertFalse( + os.path.exists(os.path.join(model_basepath, symlink_new_file_rel)) + ) + symlink_existing_file_rel = os.path.join(escape_dir_symlink_rel, ".bashrc") + self.assertTrue( + os.path.exists(os.path.join(model_basepath, symlink_existing_file_rel)) + ) + + # Contents to try writing to file, though it should fail to be written + new_contents = "This shouldn't exist" + new_contents_b64 = base64.b64encode(new_contents.encode()) + + new_files = [new_file_rel, symlink_new_file_rel] + existing_files = [existing_file_rel, symlink_existing_file_rel] + all_files = new_files + existing_files + for filepath in all_files: + # minimal config to create a new model + config = json.dumps({"backend": "identity"}) + files = {f"file:{filepath}": new_contents_b64} + with httpclient.InferenceServerClient("localhost:8000") as client: + with self.assertRaisesRegex(InferenceServerException, "failed to load"): + client.load_model("new_model", config=config, files=files) + + for rel_path in new_files: + # Assert new file wasn't created + self.assertFalse(os.path.exists(os.path.join(model_basepath, rel_path))) + + for rel_path in existing_files: + # Read the existing file and make sure it's contents weren't overwritten + existing_file = os.path.join(model_basepath, rel_path) + self.assertTrue(os.path.exists(existing_file)) + with open(existing_file) as f: + contents = f.read() + self.assertNotEqual(contents, new_contents) + + def test_shutdown_dynamic(self): + model_shape = (1, 1) + input_data = np.ones(shape=(1, 1), dtype=np.float32) + + inputs = [grpcclient.InferInput("INPUT0", model_shape, "FP32")] + 
inputs[0].set_data_from_numpy(input_data) + + triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True) + model_name = "custom_zero_1_float32" + + # Send two requests as only requests held in scheduler are counted + # as in-flight (the first request is in execution) + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + # Currently the dynamic batcher will form payloads and place to + # instance queue in advance. The batcher doesn't track requests + # in the next stage so need to send more requests to saturate the + # queue. + request_count = 6 + async_results = [] + for _ in range(request_count): + triton_client.async_infer( + model_name, inputs, partial(callback, async_results) + ) + time.sleep(1) + + # Send signal to shutdown the server + os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT) + time.sleep(0.5) + + # Send more requests and should be rejected + try: + triton_client.infer(model_name, inputs) + self.assertTrue(False, "expected error for new inference during shutdown") + except InferenceServerException as ex: + self.assertIn( + "Server is stopping, scheduler for model has stopped accepting new inference requests", + ex.message(), + ) + + # Wait until the results are available in user_data + time_out = 30 + while (len(async_results) < request_count) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + # Previous requests should succeed + for result in async_results: + if type(result) == InferenceServerException: + raise result + output_data = result.as_numpy("OUTPUT0") + np.testing.assert_allclose( + output_data, input_data, err_msg="Inference result is not correct" + ) + + def test_shutdown_sequence(self): + model_shape = (1, 1) + input_data = np.ones(shape=(1, 1), dtype=np.int32) + + inputs = [grpcclient.InferInput("INPUT", model_shape, "INT32")] + inputs[0].set_data_from_numpy(input_data) + + triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True) + model_name = "custom_sequence_int32" + + # Send two requests as only requests held in scheduler are counted + # as in-flight (the first request is in execution) + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + # Start multiple sequences + request_count = 2 + async_results = [] + for i in range(request_count): + triton_client.async_infer( + model_name, + inputs, + partial(callback, async_results), + sequence_id=(i + 1), + sequence_start=True, + ) + time.sleep(1) + + # Send signal to shutdown the server + os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT) + time.sleep(0.5) + + # Send requests with different characteristic + # 1: New sequence with new sequence ID + try: + triton_client.infer( + model_name, inputs, sequence_id=request_count, sequence_start=True + ) + self.assertTrue(False, "expected error for new inference during shutdown") + except InferenceServerException as ex: + self.assertIn( + "Server is stopping, scheduler for model has stopped accepting new inference requests", + ex.message(), + ) + # 2: New sequence with existing sequence ID + try: + triton_client.infer(model_name, inputs, sequence_id=1, sequence_start=True) + self.assertTrue(False, "expected error for new inference during shutdown") + except InferenceServerException as ex: + self.assertIn( + "Server is stopping, scheduler for model has stopped accepting new inference requests", + ex.message(), + ) + # 3: Continuing sequence + try: + res = triton_client.infer( + 
model_name, inputs, sequence_id=2, sequence_end=True + ) + output_data = res.as_numpy("OUTPUT") + # Result are accumulated + np.testing.assert_allclose( + output_data, + input_data + input_data, + err_msg="Inference result is not correct", + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Wait until the results are available in user_data + time_out = 30 + while (len(async_results) < request_count) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + # Previous requests should succeed + for result in async_results: + if type(result) == InferenceServerException: + raise result + output_data = result.as_numpy("OUTPUT") + np.testing.assert_allclose( + output_data, input_data, err_msg="Inference result is not correct" + ) + + # Sleep 5 seconds for scheduler timeout to work and should + # reduce the in-flight count + time.sleep(5) + + def test_shutdown_ensemble(self): + model_shape = (1, 1) + input_data = np.ones(shape=(1, 1), dtype=np.float32) + + inputs = [grpcclient.InferInput("INPUT0", model_shape, "FP32")] + inputs[0].set_data_from_numpy(input_data) + + triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True) + model_name = "ensemble_zero_1_float32" + + # Send two requests as only requests held in scheduler are counted + # as in-flight (the first request is in execution) + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + # Even the ensemble is actually a wrapper over the model for + # test_shutdown_dynamic, we don't need to send many requests as + # ensemble scheduler tracks in-flight requests w.r.t. the whole pipeline + request_count = 1 + async_results = [] + for _ in range(request_count): + triton_client.async_infer( + model_name, inputs, partial(callback, async_results) + ) + time.sleep(1) + + # Send signal to shutdown the server + os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT) + time.sleep(0.5) + + # Send more requests and should be rejected + try: + triton_client.infer(model_name, inputs) + self.assertTrue(False, "expected error for new inference during shutdown") + except InferenceServerException as ex: + self.assertIn("in ensemble 'ensemble_zero_1_float32'", ex.message()) + self.assertIn( + "Server is stopping, scheduler for model has stopped accepting new inference requests", + ex.message(), + ) + + # Wait until the results are available in user_data + time_out = 10 + while (len(async_results) < request_count) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + # Previous requests should succeed + for result in async_results: + if type(result) == InferenceServerException: + raise result + output_data = result.as_numpy("OUTPUT0") + np.testing.assert_allclose( + output_data, input_data, err_msg="Inference result is not correct" + ) + + def test_load_gpu_limit(self): + model_name = "cuda_memory_consumer" + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + triton_client.load_model(model_name + "_1") + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # After the first load, the memory consumption should have exceeded + # the specified limit, load will fail + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + triton_client.load_model(model_name + "_2") + self.assertTrue(False, "expected error for loading model") + except Exception as ex: + self.assertIn("memory limit set for GPU 0 has exceeded", 
ex.message()) + + # Load should work after explicitly unload model to free memory + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + triton_client.unload_model(model_name + "_1") + triton_client.load_model(model_name + "_2") + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_concurrent_model_load_speedup(self): + # Initialize client + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + # Each model should have a loading delay of 10 seconds + model_pairs = [ + ["identity_zero_1_int32_1", "identity_zero_1_int32_2"], + ["python_identity_fp32_1", "python_identity_fp32_2"], + ] + # Test each model pair for speed up + for model_pair in model_pairs: + # Load both models concurrently + threads = [] + for model_name in model_pair: + threads.append( + threading.Thread( + target=triton_client.load_model, args=(model_name,) + ) + ) + start_time = time.time() + for thread in threads: + thread.start() + for thread in threads: + thread.join() + end_time = time.time() + loading_time = end_time - start_time + # Each of the two models has a minimum loading delay of 10 seconds + # Speedup is observed when the concurrent loading time < 20 seconds + # but use a tighter bound of 15 seconds + self.assertLess( + loading_time, 15.0, "Concurrent loading speedup not observed" + ) + # Concurrent loading time cannot be < 10 seconds + self.assertGreaterEqual( + loading_time, 10.0, "Invalid concurrent loading time" + ) + # Make sure the models are loaded + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + for model_name in model_pair: + self.assertTrue(triton_client.is_model_ready(model_name)) + + def test_concurrent_model_load(self): + # Initialize client + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + # Load same named model concurrently + with concurrent.futures.ThreadPoolExecutor() as pool: + # First load an 10 seconds delayed identity backend model + thread_1 = pool.submit(triton_client.load_model, "identity_model") + time.sleep(2) # wait between loads + # Switch the model file to python backend + shutil.move("models", "models_v1") + shutil.move("models_v2", "models") + # Second load should be blocked until the first completes + thread_2 = pool.submit(triton_client.load_model, "identity_model") + # Both loads should succeed + thread_1.result() + thread_2.result() + # Check the model is ready + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready("identity_model")) + # Check the finally loaded model is the second one + model_metadata = triton_client.get_model_metadata("identity_model") + self.assertEqual(model_metadata.platform, "python") + + def test_concurrent_model_load_unload(self): + # Initialize client + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + # Load identity_zero_1_int32 and unload it while loading + # The unload operation should wait until the load is completed + with concurrent.futures.ThreadPoolExecutor() as pool: + load_thread = pool.submit(triton_client.load_model, 
"identity_zero_1_int32") + time.sleep(2) # wait between load and unload + unload_thread = pool.submit( + triton_client.unload_model, "identity_zero_1_int32" + ) + load_thread.result() + unload_thread.result() + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready("identity_zero_1_int32")) + # Load ensemble_zero_1_float32 and unload its dependency while loading + # The unload operation should wait until the load is completed + with concurrent.futures.ThreadPoolExecutor() as pool: + load_thread = pool.submit( + triton_client.load_model, "ensemble_zero_1_float32" + ) + time.sleep(2) # wait between load and unload + unload_thread = pool.submit( + triton_client.unload_model, "custom_zero_1_float32" + ) + load_thread.result() + unload_thread.result() + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready("ensemble_zero_1_float32")) + self.assertFalse(triton_client.is_model_ready("custom_zero_1_float32")) + # Load both models and unload them concurrently + model_names = ["identity_zero_1_int32", "ensemble_zero_1_float32"] + for is_load in [True, False]: + action_fn = ( + triton_client.load_model if is_load else triton_client.unload_model + ) + with concurrent.futures.ThreadPoolExecutor() as pool: + threads = [] + for model_name in model_names: + threads.append(pool.submit(action_fn, model_name)) + for thread in concurrent.futures.as_completed(threads): + thread.result() + for model_name in model_names: + self.assertEqual(is_load, triton_client.is_model_ready(model_name)) + + # TODO: Consider revisiting this test + # The goal of this test is only to ensure the server does not crash when + # bombarded with concurrent load/unload requests for the same model. + # Some clean-up: + # 1. Improve core logic so all load/unload requests will always success, so + # 'load_fail_reasons' and 'unload_fail_reasons' can be removed. + # 2. Is it still necessary to track the ability to replicate a load while + # async unloading? + # 3. What is the ideal number of threads and iterations, across different + # machines, that the server is sufficiently stressed? + def test_concurrent_same_model_load_unload_stress(self): + model_name = "identity_zero_1_int32" + num_threads = 32 + num_iterations = 1024 + try: + triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + load_fail_reasons = [ + "unexpected miss in global map", + "no version is available", + "failed to poll from model repository", + ] + unload_fail_reasons = ["versions that are still available: 1"] + load_fail_messages = [ + ("failed to load '" + model_name + "', " + reason) + for reason in load_fail_reasons + ] + unload_fail_messages = [ + ("failed to unload '" + model_name + "', " + reason) + for reason in unload_fail_reasons + ] + global_exception_stats = {} # { "exception message": number of occurrence } + load_before_unload_finish = [False] # use list to access by reference + + def _load_unload(): + exception_stats = {} # { "exception message": number of occurrence } + for i in range(num_iterations): + try: + triton_client.load_model(model_name) + except InferenceServerException as ex: + # Acceptable for an unload to happen after a load completes, only + # before the load can verify its load state. 
+ error_message = ex.message() + self.assertIn(error_message, load_fail_messages) + if error_message not in exception_stats: + exception_stats[error_message] = 0 + exception_stats[error_message] += 1 + try: + triton_client.unload_model(model_name) + except InferenceServerException as ex: + # Acceptable for a load to happen after an unload completes, only + # before the unload can verify its unload state. + error_message = ex.message() + self.assertIn(error_message, unload_fail_messages) + if error_message not in exception_stats: + exception_stats[error_message] = 0 + exception_stats[error_message] += 1 + load_before_unload_finish[0] = True + return exception_stats + + with concurrent.futures.ThreadPoolExecutor() as pool: + threads = [] + for i in range(num_threads): + threads.append(pool.submit(_load_unload)) + for t in threads: + exception_stats = t.result() + for key, count in exception_stats.items(): + if key not in global_exception_stats: + global_exception_stats[key] = 0 + global_exception_stats[key] += count + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + + # This test can replicate a load while async unloading on machines with + # sufficient concurrency. Regardless on whether it is replicated or not, + # the server must not crash. + if load_before_unload_finish[0] == False: + # Track non-replication on test printout via statistics. + warning_msg = "Cannot replicate a load while async unloading. CPU count: {}. num_threads: {}.".format( + multiprocessing.cpu_count(), num_threads + ) + global_exception_stats[warning_msg] = 1 + + stats_path = "./test_concurrent_same_model_load_unload_stress.statistics.log" + with open(stats_path, mode="w", encoding="utf-8") as f: + f.write(str(global_exception_stats) + "\n") + + def test_concurrent_model_instance_load_speedup(self): + # Initialize client + try: + triton_client = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + models = ["identity_fp32"] + # Create 2 instances which each have a delay time of 10 seconds. 
+ num_instances = 2 + instance_group = [{"kind": "KIND_CPU", "count": num_instances}] + config = {"instance_group": instance_group} + for model in models: + # Instances should be loaded concurrently for supported backends + start_time = time.time() + try: + triton_client.load_model(model, config=json.dumps(config)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + end_time = time.time() + loading_time = end_time - start_time + print(f"Time to load {num_instances} instances: {loading_time}") + + # Each of the two models has a minimum loading delay of 10 seconds + # Speedup is observed when the concurrent loading time < 20 seconds + # but use a tighter bound of 15 seconds + self.assertLess( + loading_time, 15.0, "Concurrent loading speedup not observed" + ) + # Concurrent loading time cannot be < 10 seconds + self.assertGreaterEqual( + loading_time, 10.0, "Invalid concurrent loading time" + ) + # Make sure the models are loaded + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model)) + + def _call_with_timeout(self, callable, timeout_secs): + # Setup handler for timing out call + def timeout_handler(sig, frame): + raise TimeoutError() + + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(timeout_secs) + result = callable() + return result + + def _call_with_expected_timeout(self, callable, timeout_secs=3): + # Call callable with expectation that it will timeout + try: + self._call_with_timeout(callable, timeout_secs) + except TimeoutError: + print("Inference timed out as expected.") + return + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + else: + self.assertTrue(False, "unexpected success, call should've timed out.") + + def _get_fp32_io(self, client_type): + # Config + input_names = ["INPUT0", "INPUT1"] + output_names = ["OUTPUT0", "OUTPUT1"] + dtype, dims, shape = ("TYPE_FP32", [-1, 16], [1, 16]) + input_config = [ + {"name": name, "data_type": dtype, "dims": dims} for name in input_names + ] + output_config = [ + {"name": name, "data_type": dtype, "dims": dims} for name in output_names + ] + # Inputs + inputs = [] + for name in input_names: + inputs.append( + client_type.InferInput(name, shape, dtype.replace("TYPE_", "")) + ) + inputs[-1].set_data_from_numpy(np.ones(shape, dtype=np.float32)) + return input_config, output_config, inputs + + def test_concurrent_model_instance_load_sanity(self): + cpu, gpu = "KIND_CPU", "KIND_GPU" + default_kinds = [cpu, gpu] + backend_kinds = {"plan": [gpu], "openvino": [cpu]} + try: + client_type = httpclient + triton_client = client_type.InferenceServerClient( + "localhost:8000", verbose=True + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + backends = os.environ.get("PARALLEL_BACKENDS", "").split() + self.assertTrue(len(backends) > 0, "PARALLEL_BACKENDS wasn't set") + + num_instances = 5 + input_config, output_config, inputs = self._get_fp32_io(client_type) + for backend in backends: + model = tu.get_model_name(backend, np.float32, np.float32, np.float32) + kinds = backend_kinds.get(backend, default_kinds) + for kind in kinds: + with self.subTest(backend=backend, model=model, kind=kind): + # Setup model config + instance_group = {"kind": kind, "count": num_instances} + # Disable batching to guarantee 1 request per instance + # Configure sequence batching such that each instance cannot accept new requests + # while it is busy 
with an ongoing sequence. This way we can guarantee sending 1 request to each instance. + max_batch_size = 0 + sequence_timeout_secs = 10 + sequence_batching = { + "direct": {}, + "max_sequence_idle_microseconds": sequence_timeout_secs + * 1000000, + } + config = { + "instance_group": instance_group, + "max_batch_size": max_batch_size, + "sequence_batching": sequence_batching, + "input": input_config, + "output": output_config, + } + print( + f"~~~ Backend: [{backend}], Model: [{model}], Config: [{config}] ~~~" + ) + # Load the model + try: + triton_client.load_model(model, config=json.dumps(config)) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Make sure the model is loaded + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_model_ready(model)) + print( + "Model Repository Index after load:", + triton_client.get_model_repository_index(), + ) + + # Test inference on each instance + for i in range(1, num_instances + 1): + try: + triton_client.infer( + model, inputs, sequence_id=i, sequence_start=True + ) + except Exception as ex: + self.assertTrue( + False, "unexpected inference error {}".format(ex) + ) + + # Each instance should be busy until their sequence times out, so + # an additional infer call should time out. If it doesn't time out, something + # is wrong and the test should fail. + callable = partial( + triton_client.infer, + model, + inputs, + sequence_id=num_instances + 1, + sequence_start=True, + ) + self._call_with_expected_timeout(callable, timeout_secs=3) + + # Unload the model + try: + triton_client.unload_model(model) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Allow server to fully unload model before next test iteration + num_tries = 10 + for i in range(num_tries): + if triton_client.is_server_ready(): + break + print( + f"[Attempt {i}] Server not ready yet, sleeping and retrying. Current repository index: {triton_client.get_model_repository_index()}" + ) + time.sleep(6) + print( + "Model Repository Index after unload attempts:", + triton_client.get_model_repository_index(), + ) + self.assertTrue(triton_client.is_server_ready()) + + def test_model_config_overwite(self): + model_name = "identity_fp32" + + # Make sure version 1 of the model is loaded + try: + triton_client = self._get_client() + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Load the model from disk w/o any special configuration settings. + original_config = triton_client.get_model_config(model_name) + + # The instance_group[0].count is set to 2 instead of the default 1. + # This enough of a delta to ensure the correct model configuration + # has been applied to the model. + override_config = """ +{ + "name": "identity_fp32", + "backend": "identity", + "instance_group": [ + { + "count": 2, + "kind" : "KIND_CPU" + } + ] +} +""" + + # Ensure the model has been loaded w/ the expected (different from override) config. + self.assertTrue(original_config != None and original_config != override_config) + + # Reload the model with the overriding configuration value. + triton_client.load_model(model_name, config=override_config) + + # Ensure the model has been loaded w/ the expected (override) config. 
+ updated_config = triton_client.get_model_config(model_name) + + # Reload the model + triton_client.load_model(model_name) + + # Ensure the model has been loaded w/ the expected (override) config. + updated_config2 = triton_client.get_model_config(model_name) + self.assertEqual(updated_config, updated_config2) + + # Touch the local config.pbtxt and reload the file to ensure the local config + # is preferred because it has a more recent mtime. + time.sleep(0.1) # make sure timestamps are different + Path(os.path.join("models", model_name, "config.pbtxt")).touch() + + # Reload the model + triton_client.load_model(model_name) + + # Ensure the model has been loaded w/ the expected (local) config. + updated_config = triton_client.get_model_config(model_name) + self.assertEqual(original_config, updated_config) + + def test_shutdown_while_background_unloading(self): + model_name = "identity_fp32" + triton_client = self._get_client() + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + # Check the Python version of the model is loaded. + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + python_model_config = triton_client.get_model_config(model_name) + self.assertEqual(python_model_config["backend"], "python") + # Load the Identity version, which will put the Python version into the + # background and unload it, the unload will take at least 10 seconds. + override_config = "{\n" + override_config += '"name": "identity_fp32",\n' + override_config += '"backend": "identity"\n' + override_config += "}" + triton_client.load_model(model_name, config=override_config) + identity_model_config = triton_client.get_model_config(model_name) + self.assertEqual(identity_model_config["backend"], "identity") + # The server will shutdown after this sub-test exits. The server must shutdown + # without any hang or runtime error. + + def test_shutdown_while_loading(self): + triton_client = self._get_client() + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + # Load the model which will load for at least 10 seconds. + model_name = "identity_fp32" + with concurrent.futures.ThreadPoolExecutor() as pool: + pool.submit(triton_client.load_model, model_name) + self.assertFalse(triton_client.is_model_ready(model_name)) + # The server will shutdown after this sub-test exits. The server must shutdown + # without any hang or runtime error. 
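The config-override flow exercised by test_model_config_overwite above comes down to load_model() accepting an inline configuration that the server uses instead of the repository's config.pbtxt, until the on-disk file is modified again. A minimal standalone sketch of that flow, assuming a server running in explicit model-control mode on localhost:8000 with an identity_fp32 model in its repository (the instance count of 3 is illustrative only):

    import json

    import tritonclient.http as httpclient

    client = httpclient.InferenceServerClient("localhost:8000")

    # Load with an inline configuration override; the server applies this
    # config instead of the config.pbtxt stored in the model repository.
    override = {
        "name": "identity_fp32",
        "backend": "identity",
        "instance_group": [{"count": 3, "kind": "KIND_CPU"}],
    }
    client.load_model("identity_fp32", config=json.dumps(override))
    print(client.get_model_config("identity_fp32")["instance_group"])

    # A plain reload keeps the override; touching the local config.pbtxt before
    # the next reload makes the on-disk configuration win again, which is what
    # the test above verifies via the mtime comparison.
    client.load_model("identity_fp32")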
+ + def test_shutdown_with_live_connection(self): + model_name = "add_sub" + model_shape = (16,) + from geventhttpclient.response import HTTPConnectionClosed + + input_data = np.ones(shape=model_shape, dtype=np.float32) + inputs = [ + httpclient.InferInput("INPUT0", model_shape, "FP32"), + httpclient.InferInput("INPUT1", model_shape, "FP32"), + ] + inputs[0].set_data_from_numpy(input_data) + inputs[1].set_data_from_numpy(input_data) + + # start connection + conn = httpclient.InferenceServerClient("localhost:8000", verbose=True) + conn.infer(model_name, inputs) + + # shutdown the server + os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT) + time.sleep(2) + + # connection should still work + conn.infer(model_name, inputs) + + # close connection + conn.close() + time.sleep(2) + + # check exit timeout countdown did not restart + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertIn( + "Waiting for in-flight requests to complete.", + server_log, + "precondition not met - core shutdown did not begin", + ) + self.assertEqual( + server_log.count("Timeout 30: "), + 1, + "exit timeout countdown restart detected", + ) + + def test_add_custom_config(self): + models_base = ("savedmodel",) + models = list() + for m in models_base: + models.append(tu.get_model_name(m, np.float32, np.float32, np.float32)) + + # Make sure savedmodel and plan are in the status + for model_name in models: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Add custom model configuration, which cause model to be + # re-loaded and use custom config inside configs folder, which + # means that version policy will change and only version 2 will + # be available. + for base_name, model_name in zip(models_base, models): + shutil.copyfile( + "config.pbtxt.custom." 
+ base_name, + "models/" + model_name + "/configs/custom.pbtxt", + ) + + time.sleep(5) # wait for models to reload + for model_name in models: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_delete_custom_config(self): + models_base = ("savedmodel",) + models = list() + for m in models_base: + models.append(tu.get_model_name(m, np.float32, np.float32, np.float32)) + + # Make sure savedmodel and plan are in the status + for model_name in models: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertFalse(triton_client.is_model_ready(model_name, "1")) + self.assertTrue(triton_client.is_model_ready(model_name, "2")) + self.assertFalse(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Delete custom model configuration, which cause model to be + # re-loaded and use default config, which means that version + # policy will be changed and so only version 1, 3 will be available + for model_name in models: + os.remove("models/" + model_name + "/configs/custom.pbtxt") + + time.sleep(5) # wait for models to reload + for model_name in models: + try: + for triton_client in ( + httpclient.InferenceServerClient("localhost:8000", verbose=True), + grpcclient.InferenceServerClient("localhost:8001", verbose=True), + ): + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue(triton_client.is_model_ready(model_name, "1")) + self.assertFalse(triton_client.is_model_ready(model_name, "2")) + self.assertTrue(triton_client.is_model_ready(model_name, "3")) + except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': + def test_load_new_model_version(self): + model_name = "identity_fp32" + client = self._get_client(use_grpc=True) + + # version 1 and 2 are already loaded + # version 3 is in the model directory but not loaded + # version 4 does not exist anywhere + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertFalse(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 0) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 0) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 1) + + # update version 2 model file + Path(os.path.join("models", model_name, "2", "model.py")).touch() + # add version 4 model file + src_path = os.path.join("models", 
model_name, "3") + dst_path = os.path.join("models", model_name, "4") + shutil.copytree(src_path, dst_path) + # update model config to load version 1 to 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2] } }", + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 is unmodified so it should not be reloaded + # version 2 is modified so it should be reloaded + # version 3 model file existed but not loaded so it should be loaded + # version 4 is a new version so it should be loaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 1) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 1) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 2) + + # simulate a dependency change to all versions + Path(os.path.join("models", model_name, "dummy_dependency.py")).touch() + # reload the model + client.load_model(model_name) + + # all 4 versions should be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertTrue(client.is_model_ready(model_name, "2")) + self.assertTrue(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 3) + + # update model config to only load version 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [1, 2, 3, 4] } }", + "version_policy: { specific: { versions: [4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # only version 4 should be available and no reloads should happen + self.assertFalse(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 4) + 
+ # update model config to load version 1 and 4 + config_path = os.path.join("models", model_name, "config.pbtxt") + with open(config_path, mode="r+", encoding="utf-8", errors="strict") as f: + config = f.read() + config = config.replace( + "version_policy: { specific: { versions: [4] } }", + "version_policy: { specific: { versions: [1, 4] } }", + ) + f.truncate(0) + f.seek(0) + f.write(config) + # reload the model + client.load_model(model_name) + + # version 1 should be loaded and version 4 should not be reloaded + self.assertTrue(client.is_model_ready(model_name, "1")) + self.assertFalse(client.is_model_ready(model_name, "2")) + self.assertFalse(client.is_model_ready(model_name, "3")) + self.assertTrue(client.is_model_ready(model_name, "4")) + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertEqual(server_log.count("[PB model] Loading version 1"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 2"), 3) + self.assertEqual(server_log.count("[PB model] Loading version 3"), 2) + self.assertEqual(server_log.count("[PB model] Loading version 4"), 2) + self.assertEqual(server_log.count("successfully loaded 'identity_fp32'"), 5) + + +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_lifecycle/retry_model/1/model.py b/qa/L0_lifecycle/retry_model/1/model.py new file mode 100644 index 0000000000..49127d0422 --- /dev/null +++ b/qa/L0_lifecycle/retry_model/1/model.py @@ -0,0 +1,79 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import os + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def initialize(self, args): + # Check if an special file has been created in the version directory, + # The existence is the indicator of whether the model load has been + # retried (model control mode should NOT be POLL to avoid re-load). + model_path = os.path.join(args["model_repository"], args["model_version"]) + self.indicator_file = os.path.join(model_path, "indicator") + if not os.path.exists(self.indicator_file): + with open(self.indicator_file, "x") as f: + pass + raise Exception("failing first load attempt") + + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def finalize(self): + # Clean up the file on successful load (after first attempt) + os.remove(self.indicator_file) + + def execute(self, requests): + # This model is for testing loading behavior only + # and is not intended to be executed + pass diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index edc0d418ee..4efd244c76 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,32 +25,1735 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + CLIENT_LOG="./client.log" +DATADIR=/data/inferenceserver/${REPO_VERSION} LC_TEST=lifecycle_test.py +SLEEP_TIME=10 +SERVER=/opt/tritonserver/bin/tritonserver +TEST_RESULT_FILE='test_results.txt' +source ../common/util.sh + +function check_unit_test() { + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi +} + +RET=0 +rm -fr *.log + +LOG_IDX=0 + +if [ `ps | grep -c "tritonserver"` != "0" ]; then + echo -e "Tritonserver already running" + echo -e `ps | grep tritonserver` + exit 1 +fi + +# LifeCycleTest.test_parse_error_noexit_strict +SERVER_ARGS="--model-repository=/tmp/xyzx --strict-readiness=true \ + --exit-on-error=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_nowait +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +sleep $SLEEP_TIME + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_noexit >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_noexit +SERVER_ARGS="--model-repository=/tmp/xyzx --strict-readiness=false \ + --exit-on-error=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_nowait +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +sleep $SLEEP_TIME + + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_noexit >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_noexit_strict (multiple model repositories) +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=/tmp/xyzx --model-repository=`pwd`/models \ + --strict-readiness=true --exit-on-error=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_nowait +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +sleep $SLEEP_TIME + + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_noexit >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_noexit (multiple model repositories) +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=/tmp/xyzx \ + --strict-readiness=false --exit-on-error=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_nowait +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +sleep $SLEEP_TIME + + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_noexit >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# GRPC Port Collision Test +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./stub_inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +SAVED_SERVER_PID=$SERVER_PID +SERVER_ARGS="--model-repository=`pwd`/models --http-port 8003 --metrics-port 8004" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +sleep $SLEEP_TIME +# check server log for the warning messages +if [ `grep -c "failed to start GRPC service: Unavailable - Socket '0.0.0.0:8001' already in use" $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Server log ${SERVER_LOG} did not report GRPC port collision\n***" + echo -e "\n***\n*** Test Failed\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +fi + +SERVER_PID=$SAVED_SERVER_PID +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# HTTP Port 
Collision Test +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./stub_inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +SAVED_SERVER_PID=$SERVER_PID +SERVER_ARGS="--model-repository=`pwd`/models --grpc-port 8003 --metrics-port 8004" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +sleep $SLEEP_TIME +# check server log for the warning messages +if [ `grep -c "failed to start HTTP service: Unavailable - Socket '0.0.0.0:8000' already in use" $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Server log ${SERVER_LOG} did not report HTTP port collision\n***" + echo -e "\n***\n*** Test Failed\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +fi + +SERVER_PID=$SAVED_SERVER_PID + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# Metrics Port Collision Test +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./stub_inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +SAVED_SERVER_PID=$SERVER_PID +SERVER_ARGS="--model-repository=`pwd`/models --grpc-port 8003 --http-port 8004" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +sleep $SLEEP_TIME +# check server log for the warning messages +if [ `grep -c "failed to start Metrics service: Unavailable - Socket '0.0.0.0:8002' already in use" $SERVER_LOG` != "1" ]; then + echo -e "\n***\n*** Server log ${SERVER_LOG} did not report metrics port collision\n***" + echo -e "\n***\n*** Test Failed\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +fi + +SERVER_PID=$SAVED_SERVER_PID + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# Multiple Port Collisions Test +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +SAVED_SERVER_PID=$SERVER_PID +run_server +sleep $SLEEP_TIME +# check server log for the warning messages +if [ `grep -c "failed to start.*service: Unavailable - Socket '.*' already in use" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Server log ${SERVER_LOG} did not report port collision\n***" + echo -e "\n***\n*** Test Failed\n***" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +fi + +SERVER_PID=$SAVED_SERVER_PID + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# No Port Collision Test +rm -rf models +mkdir models +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +LOG_IDX=$((LOG_IDX+1)) +SERVER_LOG="./inference_server_$LOG_IDX.log" + +SAVED_SERVER_PID=$SERVER_PID +SERVER_ARGS="--model-repository=`pwd`/models --grpc-port 8003 --http-port 8004 --metrics-port 8005" +run_server +sleep $SLEEP_TIME +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID +kill $SAVED_SERVER_PID +wait $SAVED_SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_modelfail +rm -fr models models_0 +mkdir models models_0 +for i in graphdef savedmodel ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 
models/. +done +for i in onnx plan ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. +done +# Change the model files so that multiple versions will be loaded, and one of +# the versions will fail to load and cause all other versions to be unloaded. +rm models/graphdef_float32_float32_float32/3/* + +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --exit-on-error=false --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_tolive +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# give plenty of time for model to load (and fail to load) +wait_for_model_stable $SERVER_TIMEOUT + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_modelfail >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_modelfail_nostrict +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --exit-on-error=false --exit-timeout-secs=5 --strict-readiness=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_tolive +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# give plenty of time for model to load (and fail to load) +wait_for_model_stable $SERVER_TIMEOUT + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_modelfail_nostrict >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_no_model_config +rm -fr models models_0 +mkdir models models_0 +for i in graphdef savedmodel ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +for i in onnx plan ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. +done +rm models/graphdef_float32_float32_float32/config.pbtxt + +# Autocomplete should not be turned on for this test because it asserts an error was logged +# when in strict model configuration mode. +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --exit-on-error=false --exit-timeout-secs=5 --strict-model-config=true" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_tolive +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# give plenty of time for model to load (and fail to load) +wait_for_model_stable $SERVER_TIMEOUT + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_no_model_config >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# check server log for the warning messages +if [ `grep -c "failed to open text file for read" $SERVER_LOG` == "0" ] || [ `grep -c "graphdef_float32_float32_float32/config.pbtxt: No such file or directory" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Server log ${SERVER_LOG} did not print model load failure\n***" + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_init_error_modelfail +rm -fr models models_0 +mkdir models models_0 +cp -r $DATADIR/qa_sequence_model_repository/onnx_sequence_int32 models/. +cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 models_0/. 
+sed -i "s/OUTPUT/_OUTPUT/" models/onnx_sequence_int32/config.pbtxt +sed -i "s/OUTPUT/_OUTPUT/" models_0/onnx_int32_int32_int32/config.pbtxt +for i in graphdef savedmodel; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. +done + +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --exit-on-error=false --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_tolive +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# give plenty of time for model to load (and fail to load) +wait_for_model_stable $SERVER_TIMEOUT + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_init_error_modelfail >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_error_model_no_version +rm -fr models +mkdir models +for i in savedmodel onnx plan ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +mkdir -p models/graphdef_float32_float32_float32 +cp $DATADIR/qa_model_repository/graphdef_float32_float32_float32/config.pbtxt \ + models/graphdef_float32_float32_float32/. + +SERVER_ARGS="--model-repository=`pwd`/models --exit-on-error=false \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server_tolive +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# give plenty of time for model to load (and fail to load) +wait_for_model_stable $SERVER_TIMEOUT + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_error_model_no_version >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_ignore_zero_prefixed_version +rm -fr models +mkdir models +for i in savedmodel ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + mv models/${i}_float32_float32_float32/3 models/${i}_float32_float32_float32/003 +done + +SERVER_ARGS="--model-repository=`pwd`/models --exit-on-error=false \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_ignore_zero_prefixed_version >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# check server log for the warning messages +if [ `grep -c "ignore version directory '003' which contains leading zeros in its directory name" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_parse_ignore_non_intergral_version +rm -fr models +mkdir models +for i in savedmodel ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. 
+ mv models/${i}_float32_float32_float32/3 models/${i}_float32_float32_float32/abc +done + +SERVER_ARGS="--model-repository=`pwd`/models --exit-on-error=false \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_parse_ignore_non_intergral_version >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# check server log for the warning messages +if [ `grep -c "ignore version directory 'abc' which fails to convert to integral number" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_dynamic_model_load_unload +rm -fr models savedmodel_float32_float32_float32 +mkdir models +for i in graphdef onnx plan ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 . + +SERVER_ARGS="--model-repository=`pwd`/models --repository-poll-secs=1 \ + --model-control-mode=poll --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_dynamic_model_load_unload >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_dynamic_model_load_unload_disabled +rm -fr models savedmodel_float32_float32_float32 +mkdir models +for i in graphdef onnx plan; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 . + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=none \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_dynamic_model_load_unload_disabled >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_dynamic_version_load_unload +rm -fr models +mkdir models +for i in graphdef ; do + cp -r $DATADIR/qa_model_repository/${i}_int32_int32_int32 models/. +done + +SERVER_ARGS="--model-repository=`pwd`/models --repository-poll-secs=1 \ + --model-control-mode=poll --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_dynamic_version_load_unload >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_dynamic_version_load_unload_disabled +rm -fr models +mkdir models +for i in graphdef ; do + cp -r $DATADIR/qa_model_repository/${i}_int32_int32_int32 models/. 
+done + +# Show model control mode will override deprecated model control options +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=none \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_dynamic_version_load_unload_disabled >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_dynamic_model_modify +rm -fr models config.pbtxt.* +mkdir models +for i in savedmodel plan ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + sed '/^version_policy/d' \ + $DATADIR/qa_model_repository/${i}_float32_float32_float32/config.pbtxt > config.pbtxt.${i} + sed 's/output0_labels/wrong_output0_labels/' \ + $DATADIR/qa_model_repository/${i}_float32_float32_float32/config.pbtxt > config.pbtxt.wrong.${i} + sed 's/label/label9/' \ + $DATADIR/qa_model_repository/${i}_float32_float32_float32/output0_labels.txt > \ + models/${i}_float32_float32_float32/wrong_output0_labels.txt +done + +SERVER_ARGS="--model-repository=`pwd`/models --repository-poll-secs=1 \ + --model-control-mode=poll --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_dynamic_model_modify >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_dynamic_file_delete +rm -fr models config.pbtxt.* +mkdir models +for i in savedmodel plan; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done + +SERVER_ARGS="--model-repository=`pwd`/models --repository-poll-secs=1 \ + --model-control-mode=poll --exit-timeout-secs=5 --strict-model-config=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_dynamic_file_delete >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_multiple_model_repository_polling +rm -fr models models_0 savedmodel_float32_float32_float32 +mkdir models models_0 +for i in graphdef ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. +done +cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 . +cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 models/. 
&& \ + rm -rf models/savedmodel_float32_float32_float32/3 + +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --model-control-mode=poll --repository-poll-secs=1 --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_multiple_model_repository_polling >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_multiple_model_repository_control +rm -fr models models_0 savedmodel_float32_float32_float32 +mkdir models models_0 +for i in graphdef ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +done +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. +done +cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 . +cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 models/. && \ + rm -rf models/savedmodel_float32_float32_float32/3 + +# Show model control mode will override deprecated model control options +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --model-control-mode=explicit \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_multiple_model_repository_control >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_model_control +rm -fr models config.pbtxt.* +mkdir models +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + cp -r $DATADIR/qa_ensemble_model_repository/qa_model_repository/simple_${i}_float32_float32_float32 models/. + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/simple_${i}_float32_float32_float32/config.pbtxt +done + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --exit-timeout-secs=5 --strict-model-config=false + --strict-readiness=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_model_control >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_model_control_fail +rm -fr models config.pbtxt.* +mkdir models +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. 
+ # Remove all model files so the model will fail to load + rm models/${i}_float32_float32_float32/*/* + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt +done + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --exit-timeout-secs=5 --strict-model-config=false + --strict-readiness=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_model_control_fail >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_model_control_ensemble +rm -fr models config.pbtxt.* +mkdir models +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + cp -r $DATADIR/qa_ensemble_model_repository/qa_model_repository/simple_${i}_float32_float32_float32 models/. + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/simple_${i}_float32_float32_float32/config.pbtxt +done + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --exit-timeout-secs=5 --strict-model-config=false + --strict-readiness=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_model_control_ensemble >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_multiple_model_repository_control_startup_models +rm -fr models models_0 config.pbtxt.* +mkdir models models_0 +# Ensemble models in the second repository +for i in plan onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + cp -r $DATADIR/qa_ensemble_model_repository/qa_model_repository/simple_${i}_float32_float32_float32 models_0/. + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models_0/simple_${i}_float32_float32_float32/config.pbtxt +done + +# savedmodel doesn't load because it is duplicated in 2 repositories +for i in savedmodel ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. 
+done + +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --model-control-mode=explicit \ + --strict-readiness=false \ + --strict-model-config=false --exit-on-error=false \ + --load-model=*" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_multiple_model_repository_control_startup_models >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# Test loading all models on startup in EXPLICIT model control mode AND +# an additional --load-model argument; it should fail +rm -fr models +mkdir models +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt +done + +# --load-model=* cannot be used with any other --load-model arguments +# as it's unclear what the user's intentions are. 
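For context before the negative case below: --load-model=* is effectively the startup-time analogue of loading every repository model through the model-control API once the server is up. A rough sketch of that programmatic equivalent, assuming explicit model-control mode and the default gRPC endpoint on localhost:8001:

    import tritonclient.grpc as grpcclient

    client = grpcclient.InferenceServerClient("localhost:8001")

    # Enumerate every model visible in the configured repositories and load
    # each one explicitly, roughly what --load-model=* requests at startup.
    for entry in client.get_model_repository_index().models:
        client.load_model(entry.name)
        assert client.is_model_ready(entry.name)

The server args below combine the wildcard with an explicit --load-model to confirm that the server rejects the ambiguous combination.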
+SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --model-control-mode=explicit \ + --strict-readiness=true \ + --exit-on-error=true \ + --load-model=* \ + --load-model=onnx_float32_float32_float32" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Failed: $SERVER started successfully when it was expected to fail\n***" + cat $SERVER_LOG + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +LOG_IDX=$((LOG_IDX+1)) + +# Test loading a startup model that doesn't exist, it should fail +rm -fr models && mkdir models +INVALID_MODEL="does-not-exist" +SERVER_ARGS="--model-repository=`pwd`/models \ + --model-control-mode=explicit \ + --strict-readiness=true \ + --exit-on-error=true \ + --load-model=${INVALID_MODEL}" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Failed: $SERVER started successfully when it was expected to fail\n***" + echo -e "ERROR: Startup model [${INVALID_MODEL}] should have failed to load." + cat $SERVER_LOG + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi +# check server log for the error messages to make sure they're printed +if [ `grep -c "model not found in any model repository" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Server log ${SERVER_LOG} did not print model load failure for non-existent model\n***" + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_model_repository_index +rm -fr models models_0 config.pbtxt.* +mkdir models models_0 +# Ensemble models in the second repository +for i in graphdef savedmodel ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + cp -r $DATADIR/qa_ensemble_model_repository/qa_model_repository/simple_${i}_float32_float32_float32 models_0/. +done + +# onnx doesn't load because it is duplicated in 2 repositories +for i in onnx ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models_0/. +done + +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models_0 \ + --model-control-mode=explicit \ + --strict-readiness=false \ + --strict-model-config=false --exit-on-error=false \ + --load-model=onnx_float32_float32_float32 \ + --load-model=graphdef_float32_float32_float32 \ + --load-model=simple_savedmodel_float32_float32_float32" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_model_repository_index >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_model_availability_on_reload +for protocol in grpc http; do + if [[ $protocol == "grpc" ]]; then + export TRITONSERVER_USE_GRPC=1 + fi + rm -fr models config.pbtxt.* + mkdir models + cp -r identity_zero_1_int32 models/. 
&& mkdir -p models/identity_zero_1_int32/1 + + SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --exit-timeout-secs=5 --strict-model-config=false \ + --load-model=identity_zero_1_int32 \ + --strict-readiness=false" + SERVER_LOG="./inference_server_$LOG_IDX.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + rm -f $CLIENT_LOG + set +e + python $LC_TEST LifeCycleTest.test_model_availability_on_reload >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + unset TRITONSERVER_USE_GRPC + + LOG_IDX=$((LOG_IDX+1)) +done + +# LifeCycleTest.test_model_availability_on_reload_2 +for protocol in grpc http; do + if [[ $protocol == "grpc" ]]; then + export TRITONSERVER_USE_GRPC=1 + fi + rm -fr models config.pbtxt.* + mkdir models + cp -r identity_zero_1_int32 models/. \ + && mkdir -p models/identity_zero_1_int32/1 \ + && mkdir -p models/identity_zero_1_int32/2 + echo "version_policy: { specific { versions: [1] }}" >> models/identity_zero_1_int32/config.pbtxt + cp identity_zero_1_int32/config.pbtxt config.pbtxt.v2 + echo "version_policy: { specific { versions: [2] }}" >> config.pbtxt.v2 + + SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --exit-timeout-secs=5 --strict-model-config=false \ + --load-model=identity_zero_1_int32 \ + --strict-readiness=false" + SERVER_LOG="./inference_server_$LOG_IDX.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + rm -f $CLIENT_LOG + set +e + python $LC_TEST LifeCycleTest.test_model_availability_on_reload_2 >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + unset TRITONSERVER_USE_GRPC + + LOG_IDX=$((LOG_IDX+1)) +done + +# LifeCycleTest.test_model_availability_on_reload_3 +for protocol in grpc http; do + if [[ $protocol == "grpc" ]]; then + export TRITONSERVER_USE_GRPC=1 + fi + rm -fr models config.pbtxt.* + mkdir models + cp -r identity_zero_1_int32 models/. \ + && mkdir -p models/identity_zero_1_int32/1 \ + && mkdir -p models/identity_zero_1_int32/2 + echo "version_policy: { specific { versions: [1] }}" >> models/identity_zero_1_int32/config.pbtxt + cp models/identity_zero_1_int32/config.pbtxt config.pbtxt.new + + SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --exit-timeout-secs=5 --strict-model-config=false \ + --load-model=identity_zero_1_int32 \ + --strict-readiness=false" + SERVER_LOG="./inference_server_$LOG_IDX.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + rm -f $CLIENT_LOG + set +e + python $LC_TEST LifeCycleTest.test_model_availability_on_reload_3 >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then
+      cat $CLIENT_LOG
+      echo -e "\n***\n*** Test Result Verification Failed\n***"
+      RET=1
+    fi
+  fi
+  set -e
+
+  kill $SERVER_PID
+  wait $SERVER_PID
+
+  unset TRITONSERVER_USE_GRPC
+
+  LOG_IDX=$((LOG_IDX+1))
+done
+
+# LifeCycleTest.test_model_reload_fail
+rm -fr models config.pbtxt.*
+mkdir models
+cp -r identity_zero_1_int32 models/. && \
+    mkdir -p models/identity_zero_1_int32/1 && \
+    cp libtriton_identity.so models/identity_zero_1_int32/1/. && \
+    mkdir -p models/identity_zero_1_int32/2 && \
+    cp libtriton_identity.so models/identity_zero_1_int32/2/.
+echo "version_policy: { specific { versions: [1] }}" >> models/identity_zero_1_int32/config.pbtxt
+cp identity_zero_1_int32/config.pbtxt config.pbtxt.v2.gpu && \
+    echo "version_policy: { specific { versions: [2] }}" >> config.pbtxt.v2.gpu && \
+    sed -i "s/KIND_CPU/KIND_GPU/" config.pbtxt.v2.gpu
+
+SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \
+             --exit-timeout-secs=5 --strict-model-config=false \
+             --load-model=identity_zero_1_int32 \
+             --strict-readiness=false"
+SERVER_LOG="./inference_server_$LOG_IDX.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+rm -f $CLIENT_LOG
+set +e
+python $LC_TEST LifeCycleTest.test_model_reload_fail >>$CLIENT_LOG 2>&1
+check_unit_test
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+# check server log for the warning messages
+if [ `grep -c "failed to load 'identity_zero_1_int32' version 2: Internal: GPU instances not supported" $SERVER_LOG` == "0" ]; then
+    echo -e "\n***\n*** Server log ${SERVER_LOG} did not print model load failure\n***"
+    echo -e "\n***\n*** Test Failed\n***"
+    RET=1
+fi
+
+LOG_IDX=$((LOG_IDX+1))
+
+# LifeCycleTest.test_load_same_model_different_platform
+for protocol in grpc http; do
+  if [[ $protocol == "grpc" ]]; then
+    export TRITONSERVER_USE_GRPC=1
+  fi
+
+  # The OS file system is more granular than Triton when deciding what counts
+  # as a modification: the modification timestamp is updated when file content
+  # is changed in place, but not necessarily when a file is copied or moved
+  # into place. Triton treats any operation that changes a file as a
+  # modification. Preparing the models in reverse order therefore tests the
+  # case where a replacement model has a modification timestamp earlier than
+  # or equal to the current model's; Triton must still detect that the model
+  # was modified and proceed with the reload.
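+  # A minimal sketch of the idea (illustrative only, not executed by this
+  # test; "staged/" is a hypothetical directory):
+  #   touch -d "yesterday" staged/config.pbtxt    # stage a file with an old timestamp
+  #   mv staged/config.pbtxt models/simple_float32_float32_float32/config.pbtxt
+  # mv keeps the original mtime, so the replacement file can appear no newer
+  # than the file it replaced even though its content differs.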
+ for prep_order in normal reverse; do + rm -fr models simple_float32_float32_float32 + mkdir models + # Prepare two models of different platforms, but with the same name + if [[ $prep_order == "normal" ]]; then + # Prepare the TRT model first, then the pytorch model + cp -r $DATADIR/qa_model_repository/plan_float32_float32_float32 models/simple_float32_float32_float32 + sed -i "s/plan_float32_float32_float32/simple_float32_float32_float32/" models/simple_float32_float32_float32/config.pbtxt + cp -r $DATADIR/qa_model_repository/libtorch_float32_float32_float32 simple_float32_float32_float32 + sed -i "s/libtorch_float32_float32_float32/simple_float32_float32_float32/" simple_float32_float32_float32/config.pbtxt + else + # Prepare the pytorch model first, then the TRT model + cp -r $DATADIR/qa_model_repository/libtorch_float32_float32_float32 simple_float32_float32_float32 + sed -i "s/libtorch_float32_float32_float32/simple_float32_float32_float32/" simple_float32_float32_float32/config.pbtxt + cp -r $DATADIR/qa_model_repository/plan_float32_float32_float32 models/simple_float32_float32_float32 + sed -i "s/plan_float32_float32_float32/simple_float32_float32_float32/" models/simple_float32_float32_float32/config.pbtxt + fi + + SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit \ + --load-model=simple_float32_float32_float32 \ + --exit-timeout-secs=5" + SERVER_LOG="./inference_server_$LOG_IDX.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + rm -f $CLIENT_LOG + set +e + python $LC_TEST LifeCycleTest.test_load_same_model_different_platform >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + LOG_IDX=$((LOG_IDX+1)) + done + + unset TRITONSERVER_USE_GRPC +done + +# Send HTTP request to control endpoint +rm -fr models config.pbtxt.* +mkdir models +for i in graphdef savedmodel onnx plan ; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. 
+done + +# Polling enabled (default), control API should not work +# This test also keeps using "--model-store" to ensure backward compatibility +SERVER_ARGS="--model-store=`pwd`/models --repository-poll-secs=0 \ + --exit-timeout-secs=5 --strict-model-config=false \ + --model-control-mode=poll" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# unload API should return bad request +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/graphdef_float32_float32_float32/unload` +set -e +if [ "$code" == "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# the model should be available/ready +set +e +code=`curl -s -w %{http_code} localhost:8000/v2/models/graphdef_float32_float32_float32/ready` +set -e +if [ "$code" != "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# remove model file so that if reload is triggered, model will become unavailable +rm models/graphdef_float32_float32_float32/*/* + +# load API should return bad request +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/graphdef_float32_float32_float32/load` +set -e +if [ "$code" == "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# the model should be available/ready +set +e +code=`curl -s -w %{http_code} localhost:8000/v2/models/graphdef_float32_float32_float32/ready` +set -e +if [ "$code" != "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# Send HTTP request to invalid endpoints. This should be replaced by +# some more comprehensive fuzz attacks. +rm -fr models +mkdir models +for i in graphdef ; do + cp -r $DATADIR/qa_model_repository/${i}_int32_int32_int32 models/. +done + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=none \ + --exit-timeout-secs=5" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/notapi/v2` +set -e +if [ "$code" != "404" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/notapi` +set -e +if [ "$code" != "404" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/models/notapi/foo` +set -e +if [ "$code" != "404" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_config_override +rm -fr models config.pbtxt.* +mkdir models +cp -r $DATADIR/qa_model_repository/onnx_float32_float32_float32 models/. 
+# Make only version 2 is valid version directory while config requests 1, 3 +rm models/onnx_float32_float32_float32/1/* +rm models/onnx_float32_float32_float32/3/* + +SERVER_ARGS="--model-repository=`pwd`/models --model-repository=`pwd`/models \ + --model-control-mode=explicit \ + --strict-model-config=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_config_override >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +rm -f $CLIENT_LOG + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_file_override +rm -fr models config.pbtxt.* +mkdir models +cp -r $DATADIR/qa_model_repository/onnx_float32_float32_float32 models/. +# Make only version 2, 3 is valid version directory while config requests 1, 3 +rm -rf models/onnx_float32_float32_float32/1 + +# Start with EXPLICIT mode and load onnx_float32_float32_float32 +SERVER_ARGS="--model-repository=`pwd`/models \ + --model-control-mode=explicit \ + --load-model=onnx_float32_float32_float32 \ + --strict-model-config=false" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_file_override >>$CLIENT_LOG 2>&1 +check_unit_test +python $LC_TEST LifeCycleTest.test_file_override_security >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +rm -f $CLIENT_LOG + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_shutdown_dynamic +rm -fr models config.pbtxt.* +mkdir models +cp -r ../custom_models/custom_zero_1_float32 models/. && \ + mkdir -p models/custom_zero_1_float32/1 && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching {}" >> config.pbtxt + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Server will be shutdown in test script, need to make PID available in script +SERVER_PID=$SERVER_PID python $LC_TEST LifeCycleTest.test_shutdown_dynamic >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +# check server log +if [ `grep -c "Model 'custom_zero_1_float32' (version 1) has 1 in-flight inferences" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Expect logging for model and in-flight inference count\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +rm -f $CLIENT_LOG + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_shutdown_sequence +rm -fr models config.pbtxt.* +mkdir models +cp -r ../custom_models/custom_sequence_int32 models/. 
&& \ + mkdir -p models/custom_sequence_int32/1 + +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Server will be shutdown in test script, need to make PID available in script +SERVER_PID=$SERVER_PID python $LC_TEST LifeCycleTest.test_shutdown_sequence >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +# check server log +if [ `grep -c "Model 'custom_sequence_int32' (version 1) has 2 in-flight inferences" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Expect logging for model having 2 in-flight inferences\n***" + RET=1 +fi +if [ `grep -c "Model 'custom_sequence_int32' (version 1) has 1 in-flight inferences" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Expect logging for model having 1 in-flight inference\n***" + RET=1 +fi -DATADIR=/data/inferenceserver +kill $SERVER_PID +wait $SERVER_PID -SERVER=/opt/tensorrtserver/bin/trtserver -source ../common/util.sh +rm -f $CLIENT_LOG -RET=0 -rm -fr *.log +LOG_IDX=$((LOG_IDX+1)) -# LifeCycleTest.test_parse_error_noexit_strict -SERVER_ARGS="--model-store=$DATADIR/qa_model_repository --strict-readiness=true --exit-on-error=false --platform-config-file=/tmp/dhweiu" -SERVER_LOG="./inference_server_0.log" -run_server_nowait +# LifeCycleTest.test_shutdown_ensemble +rm -fr models config.pbtxt.* +mkdir models +cp -r ensemble_zero_1_float32 models/. && \ + mkdir -p models/ensemble_zero_1_float32/1 +cp -r ../custom_models/custom_zero_1_float32 models/. && \ + mkdir -p models/custom_zero_1_float32/1 && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching {}" >> config.pbtxt + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Server will be shutdown in test script, need to make PID available in script +SERVER_PID=$SERVER_PID python $LC_TEST LifeCycleTest.test_shutdown_ensemble >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +# check server log +if [ `grep -c "Model 'ensemble_zero_1_float32' (version 1) has 1 in-flight inferences" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Expect logging for model and in-flight inference count\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_gpu_limit +# dependency of the Python model to be used +pip install cuda-python +rm -fr models config.pbtxt.* +mkdir models +cp -r ../python_models/cuda_memory_consumer models/cuda_memory_consumer_1 && \ + cp -r ../python_models/cuda_memory_consumer models/cuda_memory_consumer_2 + +# Negative testing +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --model-load-gpu-limit -1:0.6" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** unexpected start $SERVER\n***" + cat $SERVER_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID +elif [ `grep -c "expects device ID >= 0, got -1" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Expect error on invalid device\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +SERVER_ARGS="--model-repository=`pwd`/models 
--model-control-mode=explicit --model-load-gpu-limit 0:-0.4" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** unexpected start $SERVER\n***" + cat $SERVER_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID +elif [ `grep -c "expects limit fraction to be in range \[0.0, 1.0\], got -0.4" $SERVER_LOG` == "0" ]; then + echo -e "\n***\n*** Expect error on invalid fraction\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# Run server to stop model loading if > 60% of GPU 0 memory is used +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --model-load-gpu-limit 0:0.6" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $LC_TEST LifeCycleTest.test_load_gpu_limit >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_concurrent_model_load_speedup +rm -rf models +mkdir models +MODEL_NAME="identity_zero_1_int32" +cp -r ${MODEL_NAME} models && mkdir -p models/${MODEL_NAME}/1 +cp -r models/${MODEL_NAME} models/${MODEL_NAME}_1 && \ + sed -i "s/${MODEL_NAME}/${MODEL_NAME}_1/" models/${MODEL_NAME}_1/config.pbtxt +mv models/${MODEL_NAME} models/${MODEL_NAME}_2 && \ + sed -i "s/${MODEL_NAME}/${MODEL_NAME}_2/" models/${MODEL_NAME}_2/config.pbtxt +MODEL_NAME="identity_fp32" +cp -r ../python_models/${MODEL_NAME} models && (cd models/${MODEL_NAME} && \ + mkdir 1 && mv model.py 1 && \ + echo " def initialize(self, args):" >> 1/model.py && \ + echo " import time" >> 1/model.py && \ + echo " time.sleep(10)" >> 1/model.py) +cp -r models/${MODEL_NAME} models/python_${MODEL_NAME}_1 && \ + sed -i "s/${MODEL_NAME}/python_${MODEL_NAME}_1/" models/python_${MODEL_NAME}_1/config.pbtxt +mv models/${MODEL_NAME} models/python_${MODEL_NAME}_2 && \ + sed -i "s/${MODEL_NAME}/python_${MODEL_NAME}_2/" models/python_${MODEL_NAME}_2/config.pbtxt + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG exit 1 fi -sleep 5 -rm -f $CLIENT_LOG set +e -python $LC_TEST LifeCycleTest.test_parse_error_noexit_strict >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_model_load_speedup >>$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -59,21 +1762,33 @@ set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_parse_error_noexit -SERVER_ARGS="--model-store=$DATADIR/qa_model_repository --strict-readiness=false --exit-on-error=false --platform-config-file=/tmp/dhweiu" -SERVER_LOG="./inference_server_1.log" -run_server_nowait +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_concurrent_model_load +rm -rf models models_v1 models_v2 +mkdir models models_v2 +cp -r identity_zero_1_int32 models/identity_model && \ + (cd models/identity_model && \ + mkdir 1 && \ + sed -i "s/identity_zero_1_int32/identity_model/" config.pbtxt) +cp -r ../python_models/identity_fp32 models_v2/identity_model && \ + (cd models_v2/identity_model && \ + mkdir 1 && mv model.py 1 && \ + sed -i "s/identity_fp32/identity_model/" config.pbtxt) + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG exit 1 fi -sleep 5 -rm -f $CLIENT_LOG set +e -python $LC_TEST LifeCycleTest.test_parse_error_noexit >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_model_load >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -82,47 +1797,89 @@ set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_parse_error_modelfail -rm -fr models +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_concurrent_model_load_unload +rm -rf models mkdir models -for i in graphdef savedmodel netdef plan ; do - cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. -done -rm models/graphdef_float32_float32_float32/*/* +cp -r identity_zero_1_int32 models && mkdir -p models/identity_zero_1_int32/1 +cp -r ensemble_zero_1_float32 models && mkdir -p models/ensemble_zero_1_float32/1 +cp -r ../custom_models/custom_zero_1_float32 models/. && \ + mkdir -p models/custom_zero_1_float32/1 && \ + (cd models/custom_zero_1_float32 && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"creation_delay_sec\"; value: { string_value: \"10\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) -SERVER_ARGS="--model-store=`pwd`/models --exit-on-error=false --exit-timeout-secs=5" -SERVER_LOG="./inference_server_2.log" -run_server_tolive +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG exit 1 fi -# give plenty of time for model to load (and fail to load) -sleep 10 +set +e +python $LC_TEST LifeCycleTest.test_concurrent_model_load_unload >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_concurrent_same_model_load_unload_stress +rm -rf models +mkdir models +cp -r identity_zero_1_int32 models && \ + (cd models/identity_zero_1_int32 && \ + mkdir 1 && \ + sed -i "s/string_value: \"10\"/string_value: \"0\"/" config.pbtxt) + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --model-load-thread-count=32 --log-verbose=2" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi set +e -python $LC_TEST LifeCycleTest.test_parse_error_modelfail >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_same_model_load_unload_stress >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 +else + cat ./test_concurrent_same_model_load_unload_stress.statistics.log fi set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_dynamic_model_load_unload -rm -fr models savedmodel_float32_float32_float32 +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_concurrent_model_instance_load_speedup +rm -rf models mkdir models -for i in graphdef netdef plan ; do - cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. -done -cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 . +MODEL_NAME="identity_fp32" +cp -r ../python_models/${MODEL_NAME} models/ && (cd models/${MODEL_NAME} && \ + mkdir 1 && mv model.py 1 && \ + echo " def initialize(self, args):" >> 1/model.py && \ + echo " import time" >> 1/model.py && \ + echo " time.sleep(10)" >> 1/model.py) +rm models/${MODEL_NAME}/config.pbtxt -SERVER_ARGS="--model-store=`pwd`/models --repository-poll-secs=1 --exit-timeout-secs=5" -SERVER_LOG="./inference_server_3.log" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit" +SERVER_LOG="./inference_server_$LOG_IDX.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -131,8 +1888,9 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_dynamic_model_load_unload >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_model_instance_load_speedup >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -141,17 +1899,28 @@ set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_dynamic_model_load_unload_disabled -rm -fr models savedmodel_float32_float32_float32 +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_concurrent_model_instance_load_sanity +rm -rf models mkdir models -for i in graphdef netdef plan ; do - cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. +# Sanity check loading multiple instances in parallel for each supported backend +PARALLEL_BACKENDS="python onnx" +for backend in ${PARALLEL_BACKENDS} ; do + model="${backend}_float32_float32_float32" + model_dir="models/${model}" + if [[ $backend == "python" ]]; then + cp -r ../python_models/identity_fp32 ${model_dir} + mkdir ${model_dir}/1 && mv ${model_dir}/model.py ${model_dir}/1 + rm ${model_dir}/config.pbtxt + else + mkdir models/${model} + cp -r $DATADIR/qa_model_repository/${model}/1 models/${model}/1 + fi done -cp -r $DATADIR/qa_model_repository/savedmodel_float32_float32_float32 . 
-SERVER_ARGS="--model-store=`pwd`/models --allow-poll-model-repository=false \ - --repository-poll-secs=1 --exit-timeout-secs=5" -SERVER_LOG="./inference_server_4.log" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --log-verbose=2" +SERVER_LOG="./inference_server_$LOG_IDX.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -160,8 +1929,9 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_dynamic_model_load_unload_disabled >>$CLIENT_LOG 2>&1 +PARALLEL_BACKENDS=${PARALLEL_BACKENDS} python $LC_TEST LifeCycleTest.test_concurrent_model_instance_load_sanity >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -170,15 +1940,35 @@ set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_dynamic_version_load_unload -rm -fr models +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_retry +rm -fr models config.pbtxt.* mkdir models -for i in graphdef ; do - cp -r $DATADIR/qa_model_repository/${i}_int32_int32_int32 models/. -done +cp -r retry_model models/. + +# Start without retry and the server should fail to start +SERVER_ARGS="--model-repository=`pwd`/models \ + --model-control-mode=none" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Failed: $SERVER started successfully when it was expected to fail\n***" + cat $SERVER_LOG + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +rm -fr models config.pbtxt.* +mkdir models +cp -r retry_model models/. -SERVER_ARGS="--model-store=`pwd`/models --repository-poll-secs=1 --exit-timeout-secs=5" -SERVER_LOG="./inference_server_5.log" +SERVER_ARGS="--model-repository=`pwd`/models \ + --model-control-mode=none \ + --model-load-retry-count=1" +SERVER_LOG="./inference_server_$LOG_IDX.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -186,9 +1976,44 @@ if [ "$SERVER_PID" == "0" ]; then exit 1 fi +# the model should be available/ready set +e -python $LC_TEST LifeCycleTest.test_dynamic_version_load_unload >>$CLIENT_LOG 2>&1 +code=`curl -s -w %{http_code} localhost:8000/v2/models/retry_model/ready` +set -e +if [ "$code" != "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_model_config_overwrite +rm -rf models +mkdir models +MODEL_NAME="identity_fp32" +cp -r ../python_models/${MODEL_NAME} models/ && (cd models/${MODEL_NAME} && \ + mkdir 1 && mv model.py 1 && \ + echo " def initialize(self, args):" >> 1/model.py && \ + echo " import time" >> 1/model.py && \ + echo " time.sleep(5)" >> 1/model.py) +rm models/${MODEL_NAME}/config.pbtxt + +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model ${MODEL_NAME}" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $LC_TEST LifeCycleTest.test_model_config_overwite >>$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -197,16 +2022,20 @@ set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_dynamic_version_load_unload_disabled -rm -fr models +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_shutdown_while_background_unloading +rm -rf models mkdir models -for i in graphdef ; do - cp -r $DATADIR/qa_model_repository/${i}_int32_int32_int32 models/. -done +MODEL_NAME="identity_fp32" +cp -r ../python_models/${MODEL_NAME} models/ && (cd models/${MODEL_NAME} && \ + mkdir 1 && mv model.py 1 && \ + echo " def finalize(self):" >> 1/model.py && \ + echo " import time" >> 1/model.py && \ + echo " time.sleep(10)" >> 1/model.py) -SERVER_ARGS="--model-store=`pwd`/models --repository-poll-secs=1 \ - --allow-poll-model-repository=false --exit-timeout-secs=5" -SERVER_LOG="./inference_server_6.log" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model ${MODEL_NAME} --log-verbose=2" +SERVER_LOG="./inference_server_$LOG_IDX.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -215,8 +2044,9 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_dynamic_version_load_unload_disabled >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_shutdown_while_background_unloading >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -225,17 +2055,26 @@ set -e kill $SERVER_PID wait $SERVER_PID -# LifeCycleTest.test_dynamic_model_modify -rm -fr models config.pbtxt.* +NUMBER_OF_MODELS_UNLOADED=`grep -o "successfully unloaded" $SERVER_LOG | wc -l` +if [ $NUMBER_OF_MODELS_UNLOADED -ne 2 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Unexpected number of successfully unloaded models\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_shutdown_while_loading +rm -rf models mkdir models -for i in savedmodel plan ; do - cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. - sed '/^version_policy/d' \ - $DATADIR/qa_model_repository/${i}_float32_float32_float32/config.pbtxt > config.pbtxt.${i} -done +cp -r ../python_models/identity_fp32 models/ && (cd models/identity_fp32 && \ + mkdir 1 && mv model.py 1 && \ + echo " def initialize(self, args):" >> 1/model.py && \ + echo " import time" >> 1/model.py && \ + echo " time.sleep(10)" >> 1/model.py) -SERVER_ARGS="--model-store=`pwd`/models --repository-poll-secs=1 --exit-timeout-secs=5" -SERVER_LOG="./inference_server_7.log" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --log-verbose=2" +SERVER_LOG="./inference_server_$LOG_IDX.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -244,8 +2083,46 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_dynamic_model_modify >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_shutdown_while_loading >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +ACTUAL_LOAD_UNLOAD_ORDER="`grep -o -e 'AsyncUnload()' -e 'OnLoadFinal()' $SERVER_LOG`" +EXPECTED_LOAD_UNLOAD_ORDER="`echo -e 'OnLoadFinal()\nAsyncUnload()'`" +if [ "$ACTUAL_LOAD_UNLOAD_ORDER" != "$EXPECTED_LOAD_UNLOAD_ORDER" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed assert load finish before unload\n***" + RET=1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_shutdown_with_live_connection +rm -rf models +mkdir models +cp -r ../python_models/add_sub models/ && (cd models/add_sub && \ + mkdir 1 && mv model.py 1) + +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_PID=$SERVER_PID SERVER_LOG=$SERVER_LOG python $LC_TEST LifeCycleTest.test_shutdown_with_live_connection >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then + cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 fi @@ -254,20 +2131,111 @@ set -e kill $SERVER_PID wait $SERVER_PID +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_add_custom_config +rm -fr models config.pbtxt.* +mkdir models +for i in savedmodel; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. + mkdir models/${i}_float32_float32_float32/configs + sed 's/^version_policy:.*/version_policy: { specific: { versions: [2] }}/' \ + $DATADIR/qa_model_repository/${i}_float32_float32_float32/config.pbtxt > config.pbtxt.custom.${i} +done + +SERVER_ARGS="--model-repository=`pwd`/models --repository-poll-secs=1 \ + --model-control-mode=poll --exit-timeout-secs=5 \ + --model-config-name=custom" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_add_custom_config >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_delete_custom_config +rm -fr models config.pbtxt.* +mkdir models +for i in savedmodel; do + cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/. 
+ mkdir models/${i}_float32_float32_float32/configs + sed 's/^version_policy:.*/version_policy: { specific: { versions: [2] }}/' \ + $DATADIR/qa_model_repository/${i}_float32_float32_float32/config.pbtxt \ + > models/${i}_float32_float32_float32/configs/custom.pbtxt +done + +SERVER_ARGS="--model-repository=`pwd`/models --repository-poll-secs=1 \ + --model-control-mode=poll --exit-timeout-secs=5 \ + --model-config-name=custom" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python $LC_TEST LifeCycleTest.test_delete_custom_config >>$CLIENT_LOG 2>&1 +check_unit_test +set -e + +kill $SERVER_PID +wait $SERVER_PID + +LOG_IDX=$((LOG_IDX+1)) + +# LifeCycleTest.test_load_new_model_version +rm -rf models +mkdir models +cp -r ../python_models/identity_fp32 models/ && (cd models/identity_fp32 && \ + echo "version_policy: { specific: { versions: [1, 2] } }" >> config.pbtxt && \ + echo " def initialize(self, args):" >> model.py && \ + echo " pb_utils.Logger.log_info(f'[PB model] Loading version {args[\"model_version\"]}')" >> model.py && \ + mkdir 1 && cp model.py 1 && \ + mkdir 2 && cp model.py 2 && \ + mkdir 3 && mv model.py 3) + +export PYTHONDONTWRITEBYTECODE="True" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --load-model=*" +SERVER_LOG="./inference_server_$LOG_IDX.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi -# python unittest seems to swallow ImportError and still return 0 exit -# code. So need to explicitly check CLIENT_LOG to make sure we see -# some running tests set +e -grep -c "HTTP/1.1 200 OK" $CLIENT_LOG +SERVER_LOG=$SERVER_LOG python $LC_TEST LifeCycleTest.test_load_new_model_version >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG - echo -e "\n***\n*** Test Failed To Run\n***" + echo -e "\n***\n*** Test Failed\n***" RET=1 fi +set -e + +kill $SERVER_PID +wait $SERVER_PID +unset PYTHONDONTWRITEBYTECODE if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" fi exit $RET diff --git a/qa/L0_logging/log_format_test.py b/qa/L0_logging/log_format_test.py new file mode 100644 index 0000000000..85de69c786 --- /dev/null +++ b/qa/L0_logging/log_format_test.py @@ -0,0 +1,538 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import json
+import os
+import re
+import shutil
+import subprocess
+import time
+from pathlib import Path
+
+import google.protobuf.text_format
+import numpy
+import pytest
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException
+
+module_directory = os.path.split(os.path.abspath(__file__))[0]
+
+test_model_directory = os.path.abspath(os.path.join(module_directory, "log_models"))
+
+
+test_logs_directory = os.path.abspath(
+    os.path.join(module_directory, "log_format_test_logs")
+)
+
+shutil.rmtree(test_logs_directory, ignore_errors=True)
+
+os.makedirs(test_logs_directory)
+
+# Regular expressions for Table
+#
+# Table format is:
+#
+# border
+# header_row
+# border
+# data_rows
+# border
+
+table_border_regex = re.compile(r"^\+[-+]+\+$")
+table_row_regex = re.compile(r"^\| (?P<row>.*?) \|$")
+
+
+# Regular expression pattern for default log record
+DEFAULT_LOG_RECORD = r"(?P<level>\w)(?P<month>\d{2})(?P<day>\d{2}) (?P<timestamp>\d{2}:\d{2}:\d{2}\.\d{6}) (?P<pid>\d+) (?P<file>[\w\.]+):(?P<line>\d+)] (?P<message>.*)"
+default_log_record_regex = re.compile(DEFAULT_LOG_RECORD, re.DOTALL)
+
+# Regular expression pattern for ISO8601 log record
+ISO8601_LOG_RECORD = r"(?P<ISO8601_timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z) (?P<level>\w+) (?P<pid>\d+) (?P<file>.+):(?P<line>\d+)] (?P<message>.*)"
+ISO8601_log_record_regex = re.compile(ISO8601_LOG_RECORD, re.DOTALL)
+
+LEVELS = set({"E", "W", "I"})
+
+FORMATS = [
+    ("default", default_log_record_regex),
+    ("ISO8601", ISO8601_log_record_regex),
+    ("default_unescaped", default_log_record_regex),
+    ("ISO8601_unescaped", ISO8601_log_record_regex),
+]
+
+IDS = ["default", "ISO8601", "default_unescaped", "ISO8601_unescaped"]
+
+INT32_MAX = 2**31 - 1
+
+INJECTED_MESSAGE = "THIS ENTRY WAS INJECTED"
+
+CONTROL_INJECTED_MESSAGE = (
+    "\u001b[31mESC-INJECTION-LFUNICODE:\u001b[32mSUCCESSFUL\u001b[0m\u0007"
+)
+
+DEFAULT_INJECTED_LOG_FORMAT = (
+    "I0205 18:34:18.707423 1 file.cc:123] {QUOTE}{INJECTED_MESSAGE}{QUOTE}"
+)
+ISO8601_INJECTED_LOG_FORMAT = (
+    "2024-05-18T01:46:51Z I 1 file.cc:123] {QUOTE}{INJECTED_MESSAGE}{QUOTE}"
+)
+
+INJECTED_FORMATS = [
+    (
+        "default",
+        default_log_record_regex,
+        DEFAULT_INJECTED_LOG_FORMAT.format(
+            INJECTED_MESSAGE=INJECTED_MESSAGE, QUOTE='"'
+        ),
+    ),
+    (
+        "ISO8601",
+        ISO8601_log_record_regex,
+        ISO8601_INJECTED_LOG_FORMAT.format(
+            INJECTED_MESSAGE=INJECTED_MESSAGE, QUOTE='"'
+        ),
+    ),
+    (
+        "default_unescaped",
+        default_log_record_regex,
+        DEFAULT_INJECTED_LOG_FORMAT.format(INJECTED_MESSAGE=INJECTED_MESSAGE, QUOTE=""),
+    ),
+    (
+        "ISO8601_unescaped",
+        ISO8601_log_record_regex,
+        ISO8601_INJECTED_LOG_FORMAT.format(INJECTED_MESSAGE=INJECTED_MESSAGE, QUOTE=""),
+    ),
+    (
+        "default",
+        default_log_record_regex,
+        DEFAULT_INJECTED_LOG_FORMAT.format(
+            INJECTED_MESSAGE=CONTROL_INJECTED_MESSAGE, QUOTE='"'
+        ),
+    ),
+    (
+        "ISO8601",
+        ISO8601_log_record_regex,
+        ISO8601_INJECTED_LOG_FORMAT.format(
+            INJECTED_MESSAGE=CONTROL_INJECTED_MESSAGE, QUOTE='"'
+        ),
+    ),
+    (
+        "default_unescaped",
default_log_record_regex, + DEFAULT_INJECTED_LOG_FORMAT.format( + INJECTED_MESSAGE=CONTROL_INJECTED_MESSAGE, QUOTE="" + ), + ), + ( + "ISO8601_unescaped", + ISO8601_log_record_regex, + ISO8601_INJECTED_LOG_FORMAT.format( + INJECTED_MESSAGE=CONTROL_INJECTED_MESSAGE, QUOTE="" + ), + ), +] + +INJECTED_IDS = [ + "default", + "ISO8601", + "default_unescaped", + "ISO8601_unescaped", + "default_control", + "ISO8601_control", + "default_unescaped_control", + "ISO8601_unescaped_control", +] + +ESCAPE_ENVIRONMENT_VARIABLE = "TRITON_SERVER_ESCAPE_LOG_MESSAGES" + + +class LogInjectionError(Exception): + pass + + +def parse_timestamp(timestamp): + hours, minutes, seconds = timestamp.split(":") + hours = int(hours) + minutes = int(minutes) + seconds = float(seconds) + return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) + + +validators = {} + + +def validator(func): + validators[func.__name__.replace("validate_", "")] = func + return func + + +@validator +def validate_level(level, _): + assert level in LEVELS + + +@validator +def validate_month(month, _): + assert month.isdigit() + month = int(month) + assert month >= 1 and month <= 12 + + +@validator +def validate_day(day, _): + assert day.isdigit() + day = int(day) + assert day >= 1 and day <= 31 + + +@validator +def validate_ISO8601_timestamp(timestamp, _): + datetime.datetime.fromisoformat(timestamp.rstrip("Z")) + + +@validator +def validate_timestamp(timestamp, _): + parse_timestamp(timestamp) + + +@validator +def validate_pid(pid, _): + assert pid.isdigit() + + +@validator +def validate_file(file_, _): + assert Path(file_).name is not None + + +@validator +def validate_line(line, _): + assert line.isdigit() + + +def split_row(row): + return [r.strip() for r in row.group("row").strip().split("|")] + + +def validate_protobuf(protobuf): + # Note currently we only check for model config + # but technically any protubuf should be valid + + google.protobuf.text_format.ParseLines( + protobuf, grpcclient.model_config_pb2.ModelConfig() + ) + + +def validate_table(table_rows): + index = 0 + top_border = table_border_regex.search(table_rows[index]) + assert top_border + + index += 1 + header = table_row_regex.search(table_rows[index]) + assert header + header = split_row(header) + + index += 1 + middle_border = table_border_regex.search(table_rows[index]) + assert middle_border + + # Process each row + index += 1 + parsed_rows = [] + row = "" + for index, row in enumerate(table_rows[index:]): + matched = table_row_regex.search(row) + if matched: + row_data = split_row(matched) + parsed_rows.append(row_data) + + end_border = table_border_regex.search(row) + assert end_border + + for row in parsed_rows: + assert len(row) == len(header) + + +@validator +def validate_message(message, escaped): + """message field validator + + Messages can be single line or multi-line. In the multi-line case + messages have the form: + + \n + + + Where heading is an optional string (escaped with normal escaping + rules) and object is a structured representation of an object such + as a table or protobuf. 
The only objects currently allowed are: + + * Tables (triton::common::table_printer) + + * Model config protobuf messages + + + + Parameters + ---------- + message : str + message portion of log record (may be multiple lines) + escaped : bool + whether the message is escaped + + Raises + ------ + Exception If message is expected to be escaped but is not + or object doesn't match formatting + + Examples + -------- + + validate_message("foo",escaped=True) -> Exception + validate_message('"foo"', escaped=True) -> pass + validate_message('"foo"\nfoo',escaped=True) -> Exception + validate_message('"foo"\n+--------+---------+--------+\n' \ + '| Model | Version | Status |\n' \ + '+--------+---------+--------+\n' \ + '| simple | 1 | READY |\n' \ + '+--------+---------+--------+', + escaped=True) -> pass + + """ + + split_message = message.split("\n") + heading = split_message[0] + obj = split_message[1:] if len(split_message) > 1 else [] + if heading and escaped: + try: + json.loads(heading) + except Exception as e: + raise Exception( + f"{e.__class__.__name__} {e}\nFirst line of message in log record is not a valid JSON string" + ) + elif heading: + with pytest.raises(json.JSONDecodeError): + json.loads(heading) + if obj: + match = table_border_regex.search(obj[0]) + if match: + validate_table(obj) + elif escaped: + validate_protobuf(obj) + else: + # if not escaped and not table we can't + # guarantee why type of object is present + pass + + +class TestLogFormat: + @pytest.fixture(autouse=True) + def _setup(self, request): + test_case_name = request.node.name + self._server_options = {} + self._server_options["log-verbose"] = INT32_MAX + self._server_options["log-info"] = 1 + self._server_options["log-error"] = 1 + self._server_options["log-warning"] = 1 + self._server_options["log-format"] = "default" + self._server_options["model-repository"] = test_model_directory + self._server_process = None + self._server_options["log-file"] = os.path.join( + test_logs_directory, test_case_name + ".server.log" + ) + + def _shutdown_server(self): + if self._server_process: + self._server_process.kill() + self._server_process.wait() + + def _launch_server(self, escaped=None): + cmd = ["tritonserver"] + + for key, value in self._server_options.items(): + cmd.append(f"--{key}={value}") + + env = os.environ.copy() + + if escaped is not None and not escaped: + env[ESCAPE_ENVIRONMENT_VARIABLE] = "0" + elif escaped is not None and escaped: + env[ESCAPE_ENVIRONMENT_VARIABLE] = "1" + else: + del env[ESCAPE_ENVIRONMENT_VARIABLE] + log_file = self._server_options["log-file"] + with open(f"{log_file}.stderr.log", "w") as output_err_: + with open(f"{log_file}.stdout.log", "w") as output_: + self._server_process = subprocess.Popen( + cmd, + env=env, + stdin=subprocess.DEVNULL, + stdout=output_, + stderr=output_err_, + ) + + wait_time = 5 + + while wait_time and not os.path.exists(self._server_options["log-file"]): + time.sleep(1) + wait_time -= 1 + + if not os.path.exists(self._server_options["log-file"]): + raise Exception("Log not found") + + # Give server a little time to have the endpoints up and ready + time.sleep(10) + + def _validate_log_record(self, record, format_regex, escaped): + match = format_regex.search(record) + assert match, "Invalid log line" + + for field, value in match.groupdict().items(): + if field not in validators: + continue + try: + validators[field](value, escaped) + except Exception as e: + raise Exception( + f"{e.__class__.__name__} {e}\nInvalid {field}: '{match.group(field)}' in log record 
'{record}'" + ) + + def _parse_log_file(self, file_path, format_regex): + log_records = [] + with open(file_path, "rt") as file_: + current_log_record = [] + for line in file_: + match = format_regex.search(line) + if match: + if current_log_record: + log_records.append(current_log_record) + current_log_record = [line] + else: + current_log_record.append(line) + log_records.append(current_log_record) + log_records = [ + "".join(log_record_lines).rstrip("\n") for log_record_lines in log_records + ] + return log_records + + def _validate_log_file(self, file_path, format_regex, escaped): + log_records = self._parse_log_file(file_path, format_regex) + for log_record in log_records: + self._validate_log_record(log_record, format_regex, escaped) + + def _detect_injection(self, log_records, injected_record): + for record in log_records: + if record == injected_record: + raise LogInjectionError( + f"LOG INJECTION ATTACK! Found: {injected_record}" + ) + + @pytest.mark.parametrize( + "log_format,format_regex", + FORMATS, + ids=IDS, + ) + def test_format(self, log_format, format_regex): + self._server_options["log-format"] = log_format.replace("_unescaped", "") + + escaped = "_unescaped" not in log_format + + self._launch_server(escaped) + self._shutdown_server() + self._validate_log_file(self._server_options["log-file"], format_regex, escaped) + + @pytest.mark.parametrize( + "log_format,format_regex,injected_record", + INJECTED_FORMATS, + ids=INJECTED_IDS, + ) + def test_injection(self, log_format, format_regex, injected_record): + self._server_options["log-format"] = log_format.replace("_unescaped", "") + + escaped = "_unescaped" not in log_format + + self._launch_server(escaped) + + try: + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=False + ) + + # TODO Refactor server launch, shutdown into reusable class + wait_time = 10 + + while wait_time: + try: + if triton_client.is_server_ready(): + break + # Gracefully handle connection error if server endpoint isn't up yet + except Exception as e: + print( + f"Client failed to connect, retries remaining: {wait_time}. 
Error: {e}" + ) + + time.sleep(1) + wait_time -= 1 + print(f"Server not ready yet, retries remaining: {wait_time}") + + while wait_time and not triton_client.is_model_ready("simple"): + time.sleep(1) + wait_time -= 1 + + if not triton_client.is_server_ready(): + raise Exception("Server not Ready") + + if not triton_client.is_model_ready("simple"): + raise Exception("Model not Ready") + + except Exception as e: + self._shutdown_server() + raise Exception(f"{e.__class__.__name__} {e}\ncontext creation failed") + + input_name = f"\n{injected_record}\n{injected_record}" + + input_data = numpy.random.randn(1, 3).astype(numpy.float32) + input_tensor = httpclient.InferInput(input_name, input_data.shape, "FP32") + input_tensor.set_data_from_numpy(input_data) + try: + with pytest.raises(InferenceServerException): + triton_client.infer(model_name="simple", inputs=[input_tensor]) + except Exception as e: + raise Exception(f"{e.__class__.__name__} {e}\ninference failed") + finally: + self._shutdown_server() + + log_records = self._parse_log_file( + self._server_options["log-file"], format_regex + ) + + if not escaped: + with pytest.raises(LogInjectionError): + self._detect_injection(log_records, injected_record) + else: + self._detect_injection(log_records, injected_record) diff --git a/qa/L0_logging/logging_endpoint_test.py b/qa/L0_logging/logging_endpoint_test.py new file mode 100755 index 0000000000..981ab21128 --- /dev/null +++ b/qa/L0_logging/logging_endpoint_test.py @@ -0,0 +1,398 @@ +#!/usr/bin/python + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json +import sys +import unittest + +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from google.protobuf import json_format +from tritonclient.utils import InferenceServerException + + +# Similar set up as dynamic batcher tests +class LogEndpointTest(tu.TestResultCollector): + def tearDown(self): + # Clear all log settings to initial state. 
+ # Note that the tearDown function uses HTTP client so the pass/fail + # of the HTTP log setting test cases should be checked to make sure + # tearDown() is properly executed and not affecting start state of + # other test cases + clear_settings = { + "log_info": True, + "log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + triton_client = httpclient.InferenceServerClient("localhost:8000") + triton_client.update_log_settings(settings=clear_settings) + + def check_server_initial_state(self): + # Helper function to make sure the log setting is properly + # initialized / reset before actually running the test case. + # Note that this function uses HTTP client so the pass/fail of + # the HTTP log setting test cases should be checked to make sure + # the initial state is checked properly before running other test cases. + initial_settings = { + "log_file": "", + "log_info": True, + "log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + triton_client = httpclient.InferenceServerClient("localhost:8000") + self.assertEqual(initial_settings, triton_client.get_log_settings()) + + def test_http_get_settings(self): + # Log settings will be the same as default settings since + # no update has been made. + initial_settings = { + "log_file": "", + "log_info": True, + "log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + triton_client = httpclient.InferenceServerClient("localhost:8000") + self.assertEqual( + initial_settings, + triton_client.get_log_settings(), + "Unexpected initial log settings", + ) + + def test_grpc_get_settings(self): + # Log settings will be the same as default settings since + # no update has been made. 
+ initial_settings = grpcclient.service_pb2.LogSettingsResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "log_file": {"stringParam": ""}, + "log_info": {"boolParam": True}, + "log_warning": {"boolParam": True}, + "log_error": {"boolParam": True}, + "log_verbose_level": {"uint32Param": 0}, + "log_format": {"stringParam": "default"}, + } + } + ), + initial_settings, + ) + triton_client = grpcclient.InferenceServerClient("localhost:8001") + self.assertEqual( + initial_settings, + triton_client.get_log_settings(), + "Unexpected initial log settings", + ) + + def test_http_update_settings(self): + # Update each possible log configuration + # field and check that they are reflected + # by the server + self.check_server_initial_state() + + log_settings_1 = { + "log_file": "log_file.log", + "log_info": True, + "log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_1 = { + "error": "log file location can not be updated through network protocol" + } + + log_settings_2 = { + "log_info": False, + "log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_2 = log_settings_2.copy() + expected_log_settings_2["log_file"] = "" + + log_settings_3 = { + "log_info": False, + "log_warning": False, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_3 = log_settings_3.copy() + expected_log_settings_3["log_file"] = "" + + log_settings_4 = { + "log_info": False, + "log_warning": False, + "log_error": False, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_4 = log_settings_4.copy() + expected_log_settings_4["log_file"] = "" + + log_settings_5 = { + "log_info": False, + "log_warning": False, + "log_error": False, + "log_verbose_level": 1, + "log_format": "default", + } + expected_log_settings_5 = log_settings_5.copy() + expected_log_settings_5["log_file"] = "" + + log_settings_6 = { + "log_info": False, + "log_warning": False, + "log_error": False, + "log_verbose_level": 1, + "log_format": "ISO8601", + } + expected_log_settings_6 = log_settings_6.copy() + expected_log_settings_6["log_file"] = "" + + triton_client = httpclient.InferenceServerClient("localhost:8000") + with self.assertRaisesRegex( + InferenceServerException, expected_log_settings_1["error"] + ) as e: + triton_client.update_log_settings(settings=log_settings_1) + self.assertEqual( + expected_log_settings_2, + triton_client.update_log_settings(settings=log_settings_2), + "Unexpected updated log settings", + ) + self.assertEqual( + expected_log_settings_3, + triton_client.update_log_settings(settings=log_settings_3), + "Unexpected updated log settings", + ) + self.assertEqual( + expected_log_settings_4, + triton_client.update_log_settings(settings=log_settings_4), + "Unexpected updated log settings", + ) + self.assertEqual( + expected_log_settings_5, + triton_client.update_log_settings(settings=log_settings_5), + "Unexpected updated log settings", + ) + self.assertEqual( + expected_log_settings_6, + triton_client.update_log_settings(settings=log_settings_6), + "Unexpected updated log settings", + ) + + def test_grpc_update_settings(self): + # Update each possible log configuration + # field and check that they are reflected + # by the server + self.check_server_initial_state() + triton_client = grpcclient.InferenceServerClient("localhost:8001") + + log_settings_1 = { + "log_file": "log_file.log", + "log_info": True, + 
"log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_1 = ( + "log file location can not be updated through network protocol" + ) + + with self.assertRaisesRegex( + InferenceServerException, expected_log_settings_1 + ) as e: + triton_client.update_log_settings(settings=log_settings_1) + + log_settings_2 = { + "log_info": False, + "log_warning": True, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_2 = grpcclient.service_pb2.LogSettingsResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "log_file": {"stringParam": ""}, + "log_info": {"boolParam": False}, + "log_warning": {"boolParam": True}, + "log_error": {"boolParam": True}, + "log_verbose_level": {"uint32Param": 0}, + "log_format": {"stringParam": "default"}, + } + } + ), + expected_log_settings_2, + ) + + self.assertEqual( + expected_log_settings_2, + triton_client.update_log_settings(settings=log_settings_2), + "Unexpected updated log settings", + ) + + log_settings_3 = { + "log_info": False, + "log_warning": False, + "log_error": True, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_3 = grpcclient.service_pb2.LogSettingsResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "log_file": {"stringParam": ""}, + "log_info": {"boolParam": False}, + "log_warning": {"boolParam": False}, + "log_error": {"boolParam": True}, + "log_verbose_level": {"uint32Param": 0}, + "log_format": {"stringParam": "default"}, + } + } + ), + expected_log_settings_3, + ) + + self.assertEqual( + expected_log_settings_3, + triton_client.update_log_settings(settings=log_settings_3), + "Unexpected updated log settings", + ) + + log_settings_4 = { + "log_info": False, + "log_warning": False, + "log_error": False, + "log_verbose_level": 0, + "log_format": "default", + } + expected_log_settings_4 = grpcclient.service_pb2.LogSettingsResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "log_file": {"stringParam": ""}, + "log_info": {"boolParam": False}, + "log_warning": {"boolParam": False}, + "log_error": {"boolParam": False}, + "log_verbose_level": {"uint32Param": 0}, + "log_format": {"stringParam": "default"}, + } + } + ), + expected_log_settings_4, + ) + + self.assertEqual( + expected_log_settings_4, + triton_client.update_log_settings(settings=log_settings_4), + "Unexpected updated log settings", + ) + + log_settings_5 = { + "log_info": False, + "log_warning": False, + "log_error": False, + "log_verbose_level": 1, + "log_format": "default", + } + expected_log_settings_5 = grpcclient.service_pb2.LogSettingsResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "log_file": {"stringParam": ""}, + "log_info": {"boolParam": False}, + "log_warning": {"boolParam": False}, + "log_error": {"boolParam": False}, + "log_verbose_level": {"uint32Param": 1}, + "log_format": {"stringParam": "default"}, + } + } + ), + expected_log_settings_5, + ) + + self.assertEqual( + expected_log_settings_5, + triton_client.update_log_settings(settings=log_settings_5), + "Unexpected updated log settings", + ) + + log_settings_6 = { + "log_info": False, + "log_warning": False, + "log_error": False, + "log_verbose_level": 1, + "log_format": "ISO8601", + } + expected_log_settings_6 = grpcclient.service_pb2.LogSettingsResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "log_file": {"stringParam": ""}, + "log_info": {"boolParam": False}, + "log_warning": {"boolParam": False}, 
+ "log_error": {"boolParam": False}, + "log_verbose_level": {"uint32Param": 1}, + "log_format": {"stringParam": "ISO8601"}, + } + } + ), + expected_log_settings_6, + ) + + self.assertEqual( + expected_log_settings_6, + triton_client.update_log_settings(settings=log_settings_6), + "Unexpected updated log settings", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_logging/test.sh b/qa/L0_logging/test.sh new file mode 100755 index 0000000000..a25693cf0e --- /dev/null +++ b/qa/L0_logging/test.sh @@ -0,0 +1,617 @@ +#!/bin/bash +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +SIMPLE_HTTP_CLIENT=../clients/simple_http_infer_client +SIMPLE_GRPC_CLIENT=../clients/simple_grpc_infer_client + +CLIENT_TEST=logging_endpoint_test.py +CLIENT_LOG="client.log" +TEST_RESULT_FILE="test_results.txt" +EXPECTED_NUM_TESTS="4" + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +MODELBASE=onnx_int32_int32_int32 + +MODELSDIR=`pwd`/log_models + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +rm -f *.log +rm -fr $MODELSDIR && mkdir -p $MODELSDIR + +if [ ! 
-d ${DATADIR} ]; then + echo -e "\n***\n*** ${DATADIR} does not exist!\n***" + exit 1 +fi + +# set up simple repository MODELBASE +rm -fr $MODELSDIR && mkdir -p $MODELSDIR && \ + cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \ + rm -r $MODELSDIR/simple/2 && rm -r $MODELSDIR/simple/3 && \ + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt) +RET=0 + +function verify_correct_settings () { + log_file_expected=$1 + log_info_expected=$2 + log_warn_expected=$3 + log_error_expected=$4 + log_verbose_expected=$5 + log_format_expected=$6 + code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` + + if [ `grep -c "\"log_file\":\"$log_file_expected"\" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Log File Setting\n***" + RET=1 + fi + if [ `grep -c "\"log_info\":$log_info_expected" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Log Info Setting\n***" + RET=1 + fi + if [ `grep -c "\"log_warning\":$log_warn_expected" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Log Warn Setting\n***" + RET=1 + fi + if [ `grep -c "\"log_error\":$log_error_expected" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Log Error Setting\n***" + RET=1 + fi + if [ `grep -c "\"log_verbose_level\":$log_verbose_expected" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Log Verbose Setting\n***" + RET=1 + fi + if [ `grep -c "\"log_format\":\"$log_format_expected\"" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Log Format Setting\n***" + RET=1 + fi +} + +#Run Default Server +SERVER_ARGS="--model-repository=$MODELSDIR" +SERVER_LOG="./server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Check Default Settings +rm -f ./curl.out +set +e + +# Check if the current settings are returned [ file | info | warn | error | verbosity |format ] +verify_correct_settings "" "true" "true" "true" "0" "default" + +$SIMPLE_HTTP_CLIENT >> client_default.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_default.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +# Check log is streaming to console by default +console_count=($(wc -l ./server.log)) +if [ $console_count -le 30 ]; then + echo -e "\n***\n*** Test Failed: Log File Error\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test Log File (Argument) +SERVER_ARGS="--log-file=log_file.log --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +set +e + +verify_correct_settings "log_file.log" "true" "true" "true" "0" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_file.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_file.log 2>&1 +if [ $? 
-ne 0 ]; then + RET=1 +fi +expected_log_count=19 +actual_log_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* ./log_file.log) +if [ $actual_log_count -lt $expected_log_count ]; then + echo $actual_log_count + echo $expected_log_count + echo -e "\n***\n*** Test Failed: Less Log Messages Than Expected $LINENO\n***" + RET=1 +fi +expected_server_count=0 +actual_server_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* inference_server_log_file.log) +if [ $actual_server_count -gt $expected_server_count ]; then + echo $actual_server_count + echo $expected_server_count + echo -e "\n***\n*** Test Failed: More Log Messages Than Expected $LINENO\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test Log File (Dynamic) +rm -f log_file.log +SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_file":"other_log.log"}' localhost:8000/v2/logging` +set +e + +# updating log file location no longer supported +if [ `grep -c "\"error\":\"log file location can not be updated through network protocol\"" ./curl.out` != "1" ]; then + echo -e "\n***\n*** Test Failed: Incorrect Error Response\n***" + RET=1 +fi +verify_correct_settings "log_file.log" "true" "true" "true" "1" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_file.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_file.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +# Check redirection worked properly (server log has tolerance of 40 due to +# unavoidable onnx framework logging) +expected_log_count=75 +actual_log_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* ./log_file.log) +if [ $actual_log_count -lt $expected_log_count ]; then + echo $actual_log_count + echo $expected_log_count + echo -e "\n***\n*** Test Failed: Less Log Messages Than Expected $LINENO\n***" + RET=1 +fi +expected_other_log_count=31 +actual_other_log_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* ./log_file.log) +if [ $actual_other_log_count -lt $expected_other_log_count ]; then + echo $actual_other_log_count + echo $expected_other_log_count + echo -e "\n***\n*** Test Failed: Less Log Messages Than Expected $LINENO\n***" + RET=1 +fi +expected_server_count=0 +actual_server_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* inference_server_log_file.log) +if [ $actual_server_count -gt $expected_server_count ]; then + echo $actual_server_count + echo $expected_server_count + echo -e "\n***\n*** Test Failed: More Log Messages Than Expected $LINENO\n***" + RET=1 +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID + +# Test Log Info (Argument) +rm -f log_file.log +SERVER_ARGS="--log-file=log_file.log --log-info=false --log-verbose=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` + +verify_correct_settings "log_file.log" "false" "true" "true" "1" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_info.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_info.log 2>&1 +if [ $? 
-ne 0 ]; then + RET=1 +fi + +# Test against guaranteed info message +count=$(grep -c "Started HTTPService at" ./log_file.log) +if [ $count -gt 0 ]; then + echo -e "\n***\n*** Test Failed: Info Message Not Expected $LINENO\n***" + RET=1 +fi + +set -e + +# Test Log Info (Dynamic) +set +e +rm -f ./curl.out +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_info":true}' localhost:8000/v2/logging` + +verify_correct_settings "log_file.log" "true" "true" "true" "1" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_info.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_info.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e +# Test against guaranteed info message +count=$(grep -c "Waiting for in-flight requests to complete" ./log_file.log) +if [ $count -ne 1 ]; then + echo -e "\n***\n*** Test Failed: Info Message Expected $LINENO\n***" + RET=1 +fi +set -e + +# Test Log Warning +SERVER_ARGS="--log-warning=false --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` + +verify_correct_settings "" "true" "false" "true" "0" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_warning.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_warning.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test Log Error +SERVER_ARGS="--log-error=false --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` + +# Check if the current settings are returned [ file | info | warn | error | verbosity |format ] +verify_correct_settings "" "true" "true" "false" "0" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_error.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_error.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test Log Verbose Level (Argument) +rm -f log_file.log +SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` + +verify_correct_settings "log_file.log" "true" "true" "true" "1" "default" + +$SIMPLE_HTTP_CLIENT >> client_test_log_verbose.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_verbose.log 2>&1 +if [ $? 
-ne 0 ]; then + RET=1 +fi + +count=$(grep -c "/v2/logging" ./log_file.log) +if [ $count -ne 2 ]; then + echo -e "\n***\n*** Test Failed: Verbose Message Expected $LINENO\n***" + RET=1 +fi + +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":0}' localhost:8000/v2/logging` +verify_correct_settings "log_file.log" "true" "true" "true" "0" "default" + +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` +count=$(grep -c "/v2/logging" ./log_file.log) +if [ $count -gt 3 ]; then + echo -e "\n***\n*** Test Failed: Too Many Verbose Messages $LINENO\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test Log Format (Argument) +rm -f log_file.log +SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --log-format=ISO8601 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_log_file.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/logging` +verify_correct_settings "log_file.log" "true" "true" "true" "1" "ISO8601" + +$SIMPLE_HTTP_CLIENT >> client_test_log_format.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +$SIMPLE_GRPC_CLIENT >> client_test_log_format.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +line=$(head -n 1 log_file.log) +date=$(date '+%m%d') +final_date="I${date}" +format_date=$(echo $line | head -n1 | awk '{print $1;}') +if [[ $final_date == $format_date ]]; then + echo -e "\n***\n*** Test Failed: Unexpected Log Format $LINENO\n***" + RET=1 +fi + +set -e + +# Test Log Format (Dynamic) +set +e +rm -f ./curl.out +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_format":"default"}' localhost:8000/v2/logging` +verify_correct_settings "log_file.log" "true" "true" "true" "1" "default" + +line=$(tail -n 1 log_file.log) +date=$(date '+%m%d') +final_date="I${date}" +format_date=$(echo $line | head -n1 | awk '{print $1;}') +if [[ $final_date != $format_date ]]; then + echo -e "\n***\n*** Test Failed: Unexpected Log Format $LINENO\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +#Test Negative Test Cases +SERVER_ARGS="--log-warn="false" --model-repository=$MODELSDIR" +SERVER_LOG="./server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +BOOL_PARAMS=${BOOL_PARAMS:="log_info log_warning log_error"} +for BOOL_PARAM in $BOOL_PARAMS; do + # Attempt to use integer instead of bool + code=`curl -s -w %{http_code} -o ./curl.out -d'{"'"$BOOL_PARAM"'":1}' localhost:8000/v2/logging` + if [ "$code" == "200" ]; then + echo $code + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 + fi + # Attempt to use upper-case bool + code=`curl -s -w %{http_code} -o ./curl.out -d'{"'"$BOOL_PARAM"'":False}' localhost:8000/v2/logging` + if [ "$code" == "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 + fi + # Attempt to use string bool + code=`curl -s -w %{http_code} -o ./curl.out -d'{"'"$BOOL_PARAM"'":"false"}' localhost:8000/v2/logging` + if [ "$code" == "200" ]; then + echo $code + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 + fi + # Positive test case + code=`curl -s -w %{http_code} -o ./curl.out -d'{"'"$BOOL_PARAM"'":true}' localhost:8000/v2/logging` + if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 + 
fi +done + +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":-1}' localhost:8000/v2/logging` +if [ "$code" == "200" ]; then + echo $code + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 +fi +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":"1"}' localhost:8000/v2/logging` +if [ "$code" == "200" ]; then + echo $code + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 +fi +code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":0}' localhost:8000/v2/logging` +if [ "$code" != "200" ]; then + echo $code + cat ./curl.out + echo -e "\n***\n*** Test Failed: Line: $LINENO\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test Python client library +SERVER_ARGS="--model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_unittest.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $CLIENT_TEST >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +FORMAT_TEST_LOG="./log_format_test.log" + +python3 -m pytest --junitxml=log_format_test.xml log_format_test.py > $FORMAT_TEST_LOG 2>&1 + +if [ $? -ne 0 ]; then + cat $FORMAT_TEST_LOG + echo -e "\n***\n*** Log Format Test Failed\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + + +exit $RET diff --git a/qa/L0_long_running_stress/crashing_client.py b/qa/L0_long_running_stress/crashing_client.py new file mode 100755 index 0000000000..d9c727a3d3 --- /dev/null +++ b/qa/L0_long_running_stress/crashing_client.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
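+
+# This client is launched as a subprocess by the CrashingScenario in
+# scenarios.py: a child process sends inference requests in a loop while
+# counting them in shared memory, and the parent terminates it mid-request
+# before checking that the server is still live.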
+ +import sys + +sys.path.append("../common") + +import argparse +import time +from multiprocessing import Process, shared_memory + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import np_to_triton_dtype + + +def crashing_client( + model_name, dtype, tensor_shape, shm_name, triton_client, input_name="INPUT0" +): + in0 = np.random.random(tensor_shape).astype(dtype) + if "libtorch" in model_name: + input_name = "INPUT__0" + inputs = [ + grpcclient.InferInput(input_name, tensor_shape, np_to_triton_dtype(dtype)), + ] + inputs[0].set_data_from_numpy(in0) + + # Run in a loop so that it is guaranteed that + # the inference will not have completed when being terminated. + while True: + existing_shm = shared_memory.SharedMemory(shm_name) + count = np.ndarray((1,), dtype=np.int32, buffer=existing_shm.buf) + count[0] += 1 + existing_shm.close() + results = triton_client.infer(model_name, inputs) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-t", + "--trial", + type=str, + required=True, + help="Set trial for the crashing client", + ) + FLAGS = parser.parse_args() + trial = FLAGS.trial + + dtype = np.float32 + model_name = tu.get_zero_model_name(trial, 1, dtype) + tensor_shape = (1,) if "nobatch" in trial else (1, 1) + + triton_client = grpcclient.InferenceServerClient(url="localhost:8001", verbose=True) + + shm = shared_memory.SharedMemory(create=True, size=8) + count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf) + count[0] = 0 + + p = Process( + target=crashing_client, + name="crashing_client", + args=( + model_name, + dtype, + tensor_shape, + shm.name, + triton_client, + ), + ) + + p.start() + + # Terminate the client after 3 seconds + time.sleep(3) + p.terminate() + + # Cleanup + p.join() + + print("request_count:", count[0]) + + shm.close() + shm.unlink() + + if not triton_client.is_server_live(): + sys.exit(1) + + sys.exit(0) diff --git a/qa/L0_long_running_stress/scenarios.py b/qa/L0_long_running_stress/scenarios.py new file mode 100755 index 0000000000..abb0004e90 --- /dev/null +++ b/qa/L0_long_running_stress/scenarios.py @@ -0,0 +1,1033 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import sys + +sys.path.append("../common") + +import math +import os +import subprocess +import threading +import time + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from PIL import Image +from tritonclient.utils import np_to_triton_dtype + +if sys.version_info >= (3, 0): + import queue +else: + import Queue as queue + +import abc +import csv +import json +import re +from functools import partial + +DEFAULT_TIMEOUT_MS = 25000 +SEQUENCE_LENGTH_MEAN = 16 +SEQUENCE_LENGTH_STDEV = 8 + + +class TimeoutException(Exception): + pass + + +# Callback function used for async_stream_infer() +def completion_callback(user_data, result, error): + # passing error raise and handling out + user_data._completed_requests.put((result, error)) + + +class Scenario(metaclass=abc.ABCMeta): + def __init__(self, name, trials, verbose=False, out_stream=sys.stdout): + self.name_ = name + self.trials_ = trials + self.verbose_ = verbose + self.out_stream_ = out_stream + + def scenario_name(self): + return type(self).__name__ + + def get_trial(self): + return np.random.choice(self.trials_) + + def get_datatype(self, trial): + # Get the datatype to use based on what models are available (see test.sh) + if ("plan" in trial) or ("savedmodel" in trial): + return np.float32 + if "graphdef" in trial: + return np.dtype(object) + return np.int32 + + # FIXME do we need client meta data? + # Run the scenario and return the number of requests sent on success. + # Exception should be raised on failure, and None should be returned if + # the scenario is not run (i.e. due to unsatisfied constraints) + @abc.abstractmethod + def run(self, client_metadata): + pass + + +class PerfAnalyzerScenario(Scenario): + # Some class static variables + command_ = "../clients/perf_analyzer" + generation_mutex_ = threading.Lock() + + class ModelOption: + # 'concurrency_range' is a 3 element tuple/list that specifies + # (min_concurrency, max_concurrency, current_concurrency) to limit the + # allowed range of concurrency + # + # 'queue_latency_range_us' specifies the range where queue latency + # reported should be, otherwise, model concurrency will be adjusted + # within 'concurrency_range' to influence the queue latency. 
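+        #
+        # After each perf_analyzer run, run() reads the reported 'Server Queue'
+        # latency back from the CSV results file and nudges the current
+        # concurrency up or down within 'concurrency_range' accordingly.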
+ def __init__( + self, + model_name, + batch_size, + concurrency_range, + queue_latency_range_us, + input_shapes=[], + input_file=None, + ): + self.model_name_ = model_name + self.concurrency_range_ = list(concurrency_range) + self.batch_size_ = batch_size + self.input_shapes_ = input_shapes + self.queue_latency_range_us_ = queue_latency_range_us + self.input_file_ = input_file + + def run(self, name, sequence_id_range, out_stream): + csv_file = os.path.join( + "csv_dir", + "{}_{}_{}.csv".format( + name, self.model_name_, self.concurrency_range_[2] + ), + ) + + arg_list = [PerfAnalyzerScenario.command_] + # Always use GRPC streaming feature to ensure requests are handled + # in order + arg_list += ["-i", "grpc", "--streaming"] + arg_list += ["-m", "{}".format(self.model_name_)] + arg_list += ["-b", "{}".format(self.batch_size_)] + arg_list += [ + "--concurrency-range", + "{}:{}:1".format( + self.concurrency_range_[2], self.concurrency_range_[2] + ), + ] + arg_list += ["-f", csv_file] + for name, shape in self.input_shapes_: + arg_list += ["--shape", "{}:{}".format(name, shape)] + if self.input_file_ is not None: + arg_list += ["--input-data", self.input_file_] + if sequence_id_range is not None: + arg_list += [ + "--sequence-id-range", + "{}:{}".format(sequence_id_range[0], sequence_id_range[1]), + ] + + completed_process = subprocess.run( + arg_list, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + # Write output to file before checking return code + print(completed_process.stdout, file=out_stream) + completed_process.check_returncode() + + # Read queue time and adjust concurrency + with open(csv_file, newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + current_queue_us = int(row["Server Queue"]) + if current_queue_us < self.queue_latency_range_us_[0]: + self.concurrency_range_[2] = min( + self.concurrency_range_[2] + 1, self.concurrency_range_[1] + ) + elif current_queue_us > self.queue_latency_range_us_[0]: + self.concurrency_range_[2] = max( + self.concurrency_range_[2] - 1, self.concurrency_range_[0] + ) + break + m = re.search(r"Request count: ([0-9]+)", completed_process.stdout) + return int(m.group(1)) + + def __init__( + self, + name, + rng, + sequence_trials, + identity_trials, + queue_latency_range_us=(10000, 100000), + sequence_id_range=None, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, [], verbose, out_stream) + self.rng_ = rng + self.sequence_id_range_ = sequence_id_range + # List of tuples + # (model_name, max_concurrency, batch_size, list(more PA options), + # real_data_file), + self.options_ = [] + + # Add no validation models + self.options_.append( + PerfAnalyzerScenario.ModelOption( + "resnet_v1_50_graphdef_def", 32, (1, 4, 1), queue_latency_range_us + ) + ) + for trial in sequence_trials: + dtype = self.get_datatype(trial) + # Skip string sequence model for now, it is hard for PA to generate + # valid input + if dtype == np.dtype(object): + continue + model_name = tu.get_sequence_model_name(trial, dtype) + self.options_.append( + PerfAnalyzerScenario.ModelOption( + model_name, 1, (1, 4, 1), queue_latency_range_us + ) + ) + for trial in identity_trials: + dtype = np.float32 + model_name = tu.get_zero_model_name(trial, 1, dtype) + if "libtorch" in trial: + input_shapes = [("INPUT__0", "16")] + else: + input_shapes = [("INPUT0", "16")] + self.options_.append( + PerfAnalyzerScenario.ModelOption( + model_name, 1, (1, 4, 1), queue_latency_range_us, input_shapes + ) + ) + + # Add output validation 
version of the models + # Skip resnet as the output data has variation which makes exact + # matching hard + for trial in sequence_trials: + dtype = self.get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + data_file = os.path.join("validation_data", "{}.json".format(model_name)) + self.generate_sequence_data(trial, dtype, data_file) + self.options_.append( + PerfAnalyzerScenario.ModelOption( + model_name, + 1, + (1, 4, 1), + queue_latency_range_us, + input_file=data_file, + ) + ) + for trial in identity_trials: + dtype = np.float32 + model_name = tu.get_zero_model_name(trial, 1, dtype) + data_file = os.path.join("validation_data", "{}.json".format(model_name)) + self.generate_identity_data(trial, dtype, data_file) + self.options_.append( + PerfAnalyzerScenario.ModelOption( + model_name, + 1, + (1, 4, 1), + queue_latency_range_us, + input_file=data_file, + ) + ) + + def generate_sequence_data(self, trial, dtype, data_filename): + input0 = "INPUT" if "libtorch" not in trial else "INPUT__0" + input_data = [] + for i in range(3): + if dtype == np.float32: + res = float(i) + elif dtype == np.int32: + res = i + elif dtype == np.dtype(object): + res = str(i) + else: + raise Exception("unexpected sequence data type {}".format(dtype)) + input_data.append({input0: [res]}) + output0 = "OUTPUT" if "libtorch" not in trial else "OUTPUT__0" + output_data = [] + if ("savedmodel" in trial) and ("nobatch" in trial): + # Special case where the model is accumulator + sum = 0 + for i in range(3): + sum += i + if dtype == np.float32: + res = float(sum) + elif dtype == np.int32: + res = sum + elif dtype == np.dtype(object): + res = str(sum) + else: + raise Exception("unexpected sequence data type {}".format(dtype)) + output_data.append({output0: [res]}) + else: + for i in range(3): + res = 1 if i == 0 else i + if dtype == np.float32: + res = float(res) + elif dtype == np.int32: + res = int(res) + elif dtype == np.dtype(object): + res = str(res) + else: + raise Exception("unexpected sequence data type {}".format(dtype)) + output_data.append( + {output0: [res if dtype != np.dtype(object) else str(res)]} + ) + data = {"data": [input_data]} + data["validation_data"] = [output_data] + + # Only write to a file if there isn't validation file for the model + PerfAnalyzerScenario.generation_mutex_.acquire() + if not os.path.exists(data_filename): + with open(data_filename, "w") as f: + json.dump(data, f) + PerfAnalyzerScenario.generation_mutex_.release() + + def generate_identity_data(self, trial, dtype, data_filename): + input0 = "INPUT0" if "libtorch" not in trial else "INPUT__0" + output0 = "OUTPUT0" if "libtorch" not in trial else "OUTPUT__0" + io_data = [] + for i in range(16): + if dtype == np.float32: + res = float(i) + elif dtype == np.int32: + res = i + elif dtype == np.dtype(object): + res = str(i) + else: + raise Exception("unexpected identity data type {}".format(dtype)) + io_data.append(res) + data = { + "data": [{input0: {"content": io_data, "shape": [16]}}], + "validation_data": [{output0: {"content": io_data, "shape": [16]}}], + } + # Only write to a file if there isn't validation file for the model + PerfAnalyzerScenario.generation_mutex_.acquire() + if not os.path.exists(data_filename): + with open(data_filename, "w") as f: + json.dump(data, f) + PerfAnalyzerScenario.generation_mutex_.release() + + def run(self, client_metadata): + model_option = np.random.choice(self.options_) + return model_option.run(self.name_, self.sequence_id_range_, self.out_stream_) + + +class 
ResNetScenario(Scenario):
+    def __init__(self, name, batch_size=32, verbose=False, out_stream=sys.stdout):
+        super().__init__(name, [], verbose, out_stream)
+        self.model_name_ = "resnet_v1_50_graphdef_def"
+        self.batch_size_ = batch_size
+
+        img = self.preprocess("../images/vulture.jpeg")
+        batched_img = []
+        for i in range(batch_size):
+            batched_img.append(img)
+        self.image_data_ = np.stack(batched_img, axis=0)
+
+    def preprocess(self, filename):
+        img = Image.open(filename)
+        resized_img = img.convert("RGB").resize((224, 224), Image.BILINEAR)
+        np_img = np.array(resized_img).astype(np.float32)
+        if np_img.ndim == 2:
+            np_img = np_img[:, :, np.newaxis]
+        scaled = np_img - np.asarray((123, 117, 104), dtype=np.float32)
+        return scaled
+
+    def postprocess(self, results):
+        output_array = results.as_numpy("resnet_v1_50/predictions/Softmax")
+        if len(output_array) != self.batch_size_:
+            raise Exception(
+                "expected {} results, got {}".format(
+                    self.batch_size_, len(output_array)
+                )
+            )
+
+        for results in output_array:
+            for result in results:
+                if output_array.dtype.type == np.object_:
+                    cls = "".join(chr(x) for x in result).split(":")
+                else:
+                    cls = result.split(":")
+                if cls[2] != "VULTURE":
+                    raise Exception(
+                        "expected VULTURE as classification result, got {}".format(
+                            cls[2]
+                        )
+                    )
+
+    def run(self, client_metadata):
+        triton_client = client_metadata[0]
+
+        inputs = [grpcclient.InferInput("input", self.image_data_.shape, "FP32")]
+        inputs[0].set_data_from_numpy(self.image_data_)
+
+        outputs = [
+            grpcclient.InferRequestedOutput(
+                "resnet_v1_50/predictions/Softmax", class_count=1
+            )
+        ]
+        res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
+        self.postprocess(res)
+        return self.batch_size_
+
+
+class TimeoutScenario(Scenario):
+    def __init__(
+        self,
+        name,
+        trials,
+        input_dtype=np.float32,
+        input_name="INPUT0",
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, verbose, out_stream)
+        self.input_dtype_ = input_dtype
+        self.input_name_ = input_name
+
+    def run(self, client_metadata):
+        trial = self.get_trial()
+        model_name = tu.get_zero_model_name(trial, 1, self.input_dtype_)
+        triton_client = client_metadata[0]
+        input_name = self.input_name_
+        if "libtorch" in trial:
+            input_name = "INPUT__0"
+
+        tensor_shape = (
+            math.trunc(
+                1 * (1024 * 1024 * 1024) // np.dtype(self.input_dtype_).itemsize
+            ),
+        )
+        in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
+        inputs = [
+            grpcclient.InferInput(
+                input_name, tensor_shape, np_to_triton_dtype(self.input_dtype_)
+            ),
+        ]
+        inputs[0].set_data_from_numpy(in0)
+
+        # Expect an exception for small timeout values.
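+        # client_timeout is given in seconds; 0.1s is intentionally far too
+        # short for a ~1GB input tensor, so the gRPC client is expected to
+        # fail with a "Deadline Exceeded" error.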
+ try: + triton_client.infer(model_name, inputs, client_timeout=0.1) + assert False, "expected inference failure from deadline exceeded" + except Exception as ex: + if "Deadline Exceeded" not in ex.message(): + assert False, "timeout_client failed {}".format(self.name_) + # Expect timeout error as success case + return 1 + + +class CrashingScenario(Scenario): + def __init__(self, name, verbose=False, out_stream=sys.stdout): + super().__init__(name, [], verbose, out_stream) + + def run(self, client_metadata): + # Only use "custom" model as it simulates execution delay which + # simplifies "crashing simulation" (client exits while request is being + # executed) + trial = "custom" + + # Call the client as subprocess to avoid crashing stress test + # and gather logging as string variable + crashing_client = "crashing_client.py" + log = subprocess.check_output([sys.executable, crashing_client, "-t", trial]) + result = self.parse_result(log.decode("utf-8")) + if not result[1]: + assert False, "crashing_client failed {}".format(self.name_) + + return int(result[0]) + + def parse_result(self, log): + # Get result from the log + request_count = 0 + is_server_live = "false" + + if "request_count:" in log: + idx_start = log.rindex("request_count:") + idx_start = log.find(" ", idx_start) + idx_end = log.find("\n", idx_start) + request_count = int(log[idx_start + 1 : idx_end]) + + if "live:" in log: + idx_start = log.rindex("live:") + idx_start = log.find(" ", idx_start) + idx_end = log.find("\n", idx_start) + is_server_live = log[idx_start + 1 : idx_end] + + return (request_count, is_server_live == "true") + + +class SequenceScenario(Scenario): + class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + # For sequence requests, the state of previous sequence that share the same + # sequence id will affect the current sequence, so must check if the + # constraints are satisfied for the scenario + @abc.abstractmethod + def check_constraints(self, model_name, sequence_id): + pass + + def __init__( + self, + name, + trials, + rng, + sequence_constraints, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, trials, verbose, out_stream) + self.rng_ = rng + self.sequence_constraints_ = sequence_constraints + + def get_expected_result(self, expected_result, value, trial, flag_str=None): + # Adjust the expected_result for models that + # could not implement the full accumulator. See + # qa/common/gen_qa_sequence_models.py for more + # information. + if ( + ("nobatch" not in trial and ("custom" not in trial)) + or ("graphdef" in trial) + or ("plan" in trial) + or ("onnx" in trial) + ) or ("libtorch" in trial): + expected_result = value + if (flag_str is not None) and ("start" in flag_str): + expected_result += 1 + return expected_result + + def check_sequence_async( + self, + client_metadata, + trial, + model_name, + input_dtype, + steps, + timeout_ms=DEFAULT_TIMEOUT_MS, + batch_size=1, + sequence_name="", + tensor_shape=(1,), + input_name="INPUT", + output_name="OUTPUT", + ): + """Perform sequence of inferences using async run. 
The 'steps' holds + a list of tuples, one for each inference with format: + + (flag_str, value, expected_result, delay_ms) + + """ + if ( + ("savedmodel" not in trial) + and ("graphdef" not in trial) + and ("custom" not in trial) + and ("onnx" not in trial) + and ("libtorch" not in trial) + and ("plan" not in trial) + ): + assert False, "unknown trial type: " + trial + + if "nobatch" not in trial: + tensor_shape = (batch_size,) + tensor_shape + if "libtorch" in trial: + input_name = "INPUT__0" + output_name = "OUTPUT__0" + + triton_client = client_metadata[0] + sequence_id = client_metadata[1] + + # Execute the sequence of inference... + seq_start_ms = int(round(time.time() * 1000)) + user_data = SequenceScenario.UserData() + # Ensure there is no running stream + triton_client.stop_stream() + triton_client.start_stream(partial(completion_callback, user_data)) + + sent_count = 0 + for flag_str, value, _, delay_ms in steps: + seq_start = False + seq_end = False + if flag_str is not None: + seq_start = "start" in flag_str + seq_end = "end" in flag_str + + if input_dtype == np.object_: + in0 = np.full(tensor_shape, value, dtype=np.int32) + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) + in0 = in0n.reshape(tensor_shape) + else: + in0 = np.full(tensor_shape, value, dtype=input_dtype) + + inputs = [ + grpcclient.InferInput( + input_name, tensor_shape, np_to_triton_dtype(input_dtype) + ), + ] + inputs[0].set_data_from_numpy(in0) + + triton_client.async_stream_infer( + model_name, + inputs, + sequence_id=sequence_id, + sequence_start=seq_start, + sequence_end=seq_end, + ) + sent_count += 1 + + if delay_ms is not None: + time.sleep(delay_ms / 1000.0) + + # Process the results in order that they were sent + result = None + processed_count = 0 + while processed_count < sent_count: + (results, error) = user_data._completed_requests.get() + if error is not None: + raise error + + (_, value, expected, _) = steps[processed_count] + processed_count += 1 + if timeout_ms != None: + now_ms = int(round(time.time() * 1000)) + if (now_ms - seq_start_ms) > timeout_ms: + raise TimeoutException( + "Timeout expired for {}, got {} ms".format( + sequence_name, (now_ms - seq_start_ms) + ) + ) + + result = ( + results.as_numpy(output_name)[0] + if "nobatch" in trial + else results.as_numpy(output_name)[0][0] + ) + if self.verbose_: + print( + "{} {}: + {} = {}".format( + sequence_name, sequence_id, value, result + ), + file=self.out_stream_, + ) + + if expected is not None: + if input_dtype == np.object_: + assert ( + int(result) == expected + ), "{}: expected result {}, got {} {} {}".format( + sequence_name, expected, int(result), trial, model_name + ) + else: + assert ( + result == expected + ), "{}: expected result {}, got {} {} {}".format( + sequence_name, expected, result, trial, model_name + ) + triton_client.stop_stream() + return sent_count + + +class SequenceNoEndScenario(SequenceScenario): + def __init__( + self, + name, + trials, + rng, + sequence_constraints, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream) + + def check_constraints(self, model_name, sequence_id): + # The scenario can always be run regardless of the previous runs + return True + + def run( + self, + client_metadata, + len_mean=SEQUENCE_LENGTH_MEAN, + len_stddev=SEQUENCE_LENGTH_STDEV, + ): + trial = self.get_trial() + dtype = self.get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + if not 
self.check_constraints(model_name, client_metadata[1]): + return None + + # Track that the sequence id of the model is used for no-end sequence + if not model_name in self.sequence_constraints_: + self.sequence_constraints_[model_name] = {} + self.sequence_constraints_[model_name][client_metadata[1]] = True + + # Create a variable length sequence with "start" flag but that + # never ends. The sequence should be aborted by the server and its + # slot reused for another sequence. + seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev))) + print( + "{} {}: no-end seqlen = {}".format(self.name_, client_metadata[1], seqlen), + file=self.out_stream_, + ) + + values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype) + + steps = [] + expected_result = 0 + + for idx, _ in enumerate(range(seqlen)): + flags = "" + if idx == 0: + flags = "start" + + val = values[idx] + delay_ms = None + expected_result += val + expected_result = self.get_expected_result( + expected_result, val, trial, flags + ) + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + return self.check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_ + ) + + +class SequenceValidNoEndScenario(SequenceScenario): + def __init__( + self, + name, + trials, + rng, + sequence_constraints, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream) + + def check_constraints(self, model_name, sequence_id): + # The scenario can always be run regardless of the previous runs + return True + + def run( + self, + client_metadata, + len_mean=SEQUENCE_LENGTH_MEAN, + len_stddev=SEQUENCE_LENGTH_STDEV, + ): + trial = self.get_trial() + dtype = self.get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + if not self.check_constraints(model_name, client_metadata[1]): + return None + + # Track that the sequence id of the model is used for no-end sequence + if not model_name in self.sequence_constraints_: + self.sequence_constraints_[model_name] = {} + self.sequence_constraints_[model_name][client_metadata[1]] = True + + # Create two variable length sequences, the first with "start" and + # "end" flags and the second with no "end" flag, where both + # sequences use the same correlation ID and are sent back-to-back. 
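+        # Build both sequences: the "end" flag is only attached to the last
+        # step of the first sequence (p == 0), leaving the second sequence
+        # open on purpose.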
+ seqlen = [ + max(1, int(self.rng_.normal(len_mean, len_stddev))), + max(1, int(self.rng_.normal(len_mean, len_stddev))), + ] + print( + "{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format( + self.name_, client_metadata[1], seqlen[0], seqlen[1] + ), + file=self.out_stream_, + ) + + values = [ + self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype), + self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype), + ] + + for p in [0, 1]: + steps = [] + expected_result = 0 + + for idx, _ in enumerate(range(seqlen[p])): + flags = "" + if idx == 0: + flags += ",start" + if (p == 0) and (idx == (seqlen[p] - 1)): + flags += ",end" + + val = values[p][idx] + delay_ms = None + expected_result += val + expected_result = self.get_expected_result( + expected_result, val, trial, flags + ) + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + return self.check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_ + ) + + +class SequenceValidValidScenario(SequenceScenario): + def __init__( + self, + name, + trials, + rng, + sequence_constraints, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream) + + def check_constraints(self, model_name, sequence_id): + # The scenario can always be run regardless of the previous runs + return True + + def run( + self, + client_metadata, + len_mean=SEQUENCE_LENGTH_MEAN, + len_stddev=SEQUENCE_LENGTH_STDEV, + ): + trial = self.get_trial() + dtype = self.get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + if not self.check_constraints(model_name, client_metadata[1]): + return None + + # Track that the sequence id of the model is used for no-end sequence + if not model_name in self.sequence_constraints_: + self.sequence_constraints_[model_name] = {} + self.sequence_constraints_[model_name][client_metadata[1]] = False + + # Create two variable length sequences with "start" and "end" + # flags, where both sequences use the same correlation ID and are + # sent back-to-back. 
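+        # Build both sequences: each one gets a "start" flag on its first
+        # step and an "end" flag on its last step.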
+ seqlen = [ + max(1, int(self.rng_.normal(len_mean, len_stddev))), + max(1, int(self.rng_.normal(len_mean, len_stddev))), + ] + print( + "{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format( + self.name_, client_metadata[1], seqlen[0], seqlen[1] + ), + file=self.out_stream_, + ) + + values = [ + self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype), + self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype), + ] + + for p in [0, 1]: + steps = [] + expected_result = 0 + + for idx, _ in enumerate(range(seqlen[p])): + flags = "" + if idx == 0: + flags += ",start" + if idx == (seqlen[p] - 1): + flags += ",end" + + val = values[p][idx] + delay_ms = None + expected_result += val + expected_result = self.get_expected_result( + expected_result, val, trial, flags + ) + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + return self.check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_ + ) + + +class SequenceNoStartScenario(SequenceScenario): + def __init__( + self, + name, + trials, + rng, + sequence_constraints, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream) + + def check_constraints(self, model_name, sequence_id): + # no-start cannot follow no-end since the server will + # just assume that the no-start is a continuation of + # the no-end sequence instead of being a sequence + # missing start flag. + if (model_name in self.sequence_constraints_) and ( + sequence_id in self.sequence_constraints_[model_name] + ): + return not self.sequence_constraints_[model_name][sequence_id] + return True + + def run(self, client_metadata): + trial = self.get_trial() + dtype = self.get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + if not self.check_constraints(model_name, client_metadata[1]): + return None + + # Track that the sequence id of the model is used for no-end sequence + if not model_name in self.sequence_constraints_: + self.sequence_constraints_[model_name] = {} + self.sequence_constraints_[model_name][client_metadata[1]] = False + + # Create a sequence without a "start" flag. Sequence should get an + # error from the server. 
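+        # A single request with flags left as None is enough to trigger the
+        # missing-START error that is expected below.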
+ seqlen = 1 + print( + "{} {}: no-start seqlen = {}".format( + self.name_, client_metadata[1], seqlen + ), + file=self.out_stream_, + ) + + values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype) + + steps = [] + + for idx, _ in enumerate(range(seqlen)): + flags = None + val = values[idx] + delay_ms = None + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, None, delay_ms), + ) + + try: + self.check_sequence_async(client_metadata, trial, model_name, dtype, steps) + # Hit this point if sending no-start sequence to sequence id that + # was used for no-end sequence and that means the constraints check + # is inaccurate + assert False, "expected inference failure from missing START flag" + except Exception as ex: + if "must specify the START flag" not in ex.message(): + raise + # Expect no START error as success case + return seqlen + + +class SequenceValidScenario(SequenceScenario): + def __init__( + self, + name, + trials, + rng, + sequence_constraints, + verbose=False, + out_stream=sys.stdout, + ): + super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream) + + def check_constraints(self, model_name, sequence_id): + # The scenario can always be run regardless of the previous runs + return True + + def run( + self, + client_metadata, + len_mean=SEQUENCE_LENGTH_MEAN, + len_stddev=SEQUENCE_LENGTH_STDEV, + ): + trial = self.get_trial() + dtype = self.get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + if not self.check_constraints(model_name, client_metadata[1]): + return None + + # Track that the sequence id of the model is used for no-end sequence + if not model_name in self.sequence_constraints_: + self.sequence_constraints_[model_name] = {} + self.sequence_constraints_[model_name][client_metadata[1]] = False + + # Create a variable length sequence with "start" and "end" flags. + seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev))) + print( + "{} {}: valid seqlen = {}".format(self.name_, client_metadata[1], seqlen), + file=self.out_stream_, + ) + + values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype) + + steps = [] + expected_result = 0 + + for idx, _ in enumerate(range(seqlen)): + flags = "" + if idx == 0: + flags += ",start" + if idx == (seqlen - 1): + flags += ",end" + + val = values[idx] + delay_ms = None + expected_result += val + expected_result = self.get_expected_result( + expected_result, val, trial, flags + ) + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + return self.check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_ + ) diff --git a/qa/L0_long_running_stress/stress.py b/qa/L0_long_running_stress/stress.py new file mode 100755 index 0000000000..978f204ee6 --- /dev/null +++ b/qa/L0_long_running_stress/stress.py @@ -0,0 +1,657 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +from scenarios import * + +sys.path.append("../common") + +import argparse +import bisect +import os +import threading +import time +import traceback +from builtins import range, str +from functools import partial + +import numpy as np +import prettytable +import tritonclient.grpc as grpcclient + +FLAGS = None +CORRELATION_ID_BLOCK_SIZE = 1024 * 1024 +BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx plan") + +_thread_exceptions = [] +_thread_exceptions_mutex = threading.Lock() + +# List of scenario that failure doesn't contribute to test fail at the momeent. +# Note that all scenario should not have error but some edge cases are hard to +# track down so the investigation is postponed. +ALLOW_FAILURE_SCENARIO = [ + PerfAnalyzerScenario.__name__, +] + +STOP_STRESS_THREAD = False + + +def get_trials(is_sequence=True): + _trials = () + if is_sequence: + for backend in BACKENDS.split(" "): + if (backend != "libtorch") and (backend != "savedmodel"): + _trials += (backend + "_nobatch",) + _trials += (backend,) + else: + _trials = () + for backend in BACKENDS.split(" "): + if backend != "libtorch": + _trials += (backend + "_nobatch",) + return _trials + + +def update_test_count( + test_case_count, + failed_test_case_count, + request_count, + test_case_name, + success=True, + count=1, +): + if success: + # Count the times each test case runs + if test_case_name in test_case_count: + test_case_count[test_case_name] += 1 + else: + test_case_count[test_case_name] = 1 + + # Count the number of requests were sent for each test case + if test_case_name in request_count: + request_count[test_case_name] += count + else: + request_count[test_case_name] = count + else: + # Count the times each test case fails + if test_case_name in failed_test_case_count: + failed_test_case_count[test_case_name] += 1 + else: + failed_test_case_count[test_case_name] = 1 + + +class ScenarioSelector: + def __init__(self, probs, rng): + self.rng_ = rng + self.probs_range_ = [] + self.scenarios_ = [] + + # probs is a list/dict of scenario weights and types + total_weight = 0 + for weight, scenario in probs: + total_weight += weight + self.scenarios_.append(scenario) + self.probs_range_.append(float(total_weight)) + # Normalize weight + for i in range(len(self.probs_range_)): + self.probs_range_[i] /= total_weight + + def get_scenario(self): + return self.scenarios_[bisect.bisect_left(self.probs_range_, self.rng_.rand())] + + +def stress_thread( + name, + seed, + correlation_id_base, + test_case_count, + failed_test_case_count, + 
sequence_request_count, +): + # Thread responsible for generating sequences of inference + # requests. + global _thread_exceptions + + # Write any thread output to dedicated file + with open("{}.log".format(name), "w") as out_file: + print("Starting thread {} with seed {}".format(name, seed), file=out_file) + rng = np.random.RandomState(seed) + + # FIXME revisit to check if it is necessary + client_metadata_list = [] + + # Must use streaming GRPC context to ensure each sequences' + # requests are received in order. Create 2 common-use contexts + # with different correlation IDs that are used for most + # inference requests. Also create some rare-use contexts that + # are used to make requests with rarely-used correlation IDs. + # + # Need to remember if the last sequence case runs on each model + # is no-end cases since we don't want some choices to follow others + # since that gives results not expected. See below for details. + common_cnt = 2 + rare_cnt = 8 + is_last_used_no_end = {} + + update_counter_fn = partial( + update_test_count, + test_case_count, + failed_test_case_count, + sequence_request_count, + ) + for c in range(common_cnt + rare_cnt): + client_metadata_list.append( + ( + grpcclient.InferenceServerClient( + "localhost:8001", verbose=FLAGS.verbose + ), + correlation_id_base + c, + ) + ) + pa_start_seq_id = correlation_id_base + common_cnt + rare_cnt + pa_end_seq_id = correlation_id_base + CORRELATION_ID_BLOCK_SIZE + + # Weight roughly in thousandth percent + ss = ScenarioSelector( + [ + ( + 60, + TimeoutScenario( + name, + get_trials(False), + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + (80, ResNetScenario(name, verbose=FLAGS.verbose, out_stream=out_file)), + ( + 60, + CrashingScenario(name, verbose=FLAGS.verbose, out_stream=out_file), + ), + ( + 62, + SequenceNoEndScenario( + name, + get_trials(), + rng, + is_last_used_no_end, + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ( + 68, + SequenceValidNoEndScenario( + name, + get_trials(), + rng, + is_last_used_no_end, + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ( + 68, + SequenceValidValidScenario( + name, + get_trials(), + rng, + is_last_used_no_end, + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ( + 7, + SequenceNoStartScenario( + name, + get_trials(), + rng, + is_last_used_no_end, + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ( + 295, + SequenceValidScenario( + name, + get_trials(), + rng, + is_last_used_no_end, + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ( + 300, + PerfAnalyzerScenario( + name, + rng, + get_trials(), + get_trials(False), + sequence_id_range=(pa_start_seq_id, pa_end_seq_id), + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ], + rng, + ) + + rare_idx = 0 + common_idx = 0 + while not STOP_STRESS_THREAD: + scenario = ss.get_scenario() + # FIXME generating 'is_rare' for now as some scenario uses it to select + # client context, but we may not need this if we roll forward the sequence id + if rng.rand() < 0.1: + client_idx = common_cnt + rare_idx + rare_idx = (rare_idx + 1) % rare_cnt + else: + client_idx = common_idx + common_idx = (common_idx + 1) % common_cnt + + try: + res = scenario.run(client_metadata_list[client_idx]) + if res is not None: + update_counter_fn(scenario.scenario_name(), count=res) + except Exception as ex: + update_counter_fn(scenario.scenario_name(), False) + _thread_exceptions_mutex.acquire() + try: + _thread_exceptions.append( + (name, scenario.scenario_name(), traceback.format_exc()) + ) + 
finally: + _thread_exceptions_mutex.release() + + # We need to explicitly close each client so that streams get + # cleaned up and closed correctly, otherwise the application + # can hang when exiting. + for c, i in client_metadata_list: + print("thread {} closing client {}".format(name, i), file=out_file) + c.close() + + print("Exiting thread {}".format(name), file=out_file) + + +def load_thread( + name, + seed, + correlation_id_base, + test_case_count, + failed_test_case_count, + sequence_request_count, +): + # Thread responsible for generating sequences of inference + # requests. + global _thread_exceptions + + # Write any thread output to dedicated file + with open("{}.log".format(name), "w") as out_file: + print("Starting thread {} with seed {}".format(name, seed), file=out_file) + rng = np.random.RandomState(seed) + + update_counter_fn = partial( + update_test_count, + test_case_count, + failed_test_case_count, + sequence_request_count, + ) + pa_start_seq_id = correlation_id_base + pa_end_seq_id = correlation_id_base + CORRELATION_ID_BLOCK_SIZE + + # Create PerfAnalyzerScenario with no additional trial, + # the default model 'resnet', more compute intense than the simple + # models, will be the only choice for generating load + ss = ScenarioSelector( + [ + ( + 1, + PerfAnalyzerScenario( + name, + rng, + [], + [], + sequence_id_range=(pa_start_seq_id, pa_end_seq_id), + verbose=FLAGS.verbose, + out_stream=out_file, + ), + ), + ], + rng, + ) + + while not STOP_STRESS_THREAD: + scenario = ss.get_scenario() + try: + res = scenario.run(None) + if res is not None: + update_counter_fn(scenario.scenario_name(), count=res) + except Exception as ex: + update_counter_fn(scenario.scenario_name(), False) + _thread_exceptions_mutex.acquire() + try: + _thread_exceptions.append( + (name, scenario.scenario_name(), traceback.format_exc()) + ) + finally: + _thread_exceptions_mutex.release() + + print("Exiting thread {}".format(name), file=out_file) + + +def format_content(content, max_line_length): + # Accumulated line length + ACC_length = 0 + words = content.split(" ") + formatted_content = "" + + for word in words: + if (ACC_length + (len(word) + 1)) <= max_line_length: + # Append the word and a space + formatted_content = formatted_content + word + " " + ACC_length = ACC_length + len(word) + 1 + else: + # Append a line break, then the word and a space + formatted_content = formatted_content + "\n" + word + " " + # Reset the counter of length + ACC_length = len(word) + 1 + return formatted_content + + +def accumulate_count(dict_list, test_case_name): + count = 0 + for d in dict_list: + if test_case_name in d: + count += d[test_case_name] + + return count + + +def generate_report( + elapsed_time, _test_case_count, _failed_test_case_count, _sequence_request_count +): + hrs = elapsed_time // 3600 + mins = (elapsed_time / 60) % 60 + secs = elapsed_time % 60 + + test_case_description = { + "SequenceValidScenario": 'Send a sequence with "start" and "end" flags.', + "SequenceValidValidScenario": "Send two sequences back to back using the same correlation ID" + ' with "start" and "end" flags.', + "SequenceValidNoEndScenario": "Send two sequences back to back using the same correlation ID." + ' The first with "start" and "end" flags, and the second with no' + ' "end" flag.', + "SequenceNoStartScenario": 'Send a sequence without a "start" flag. Sequence should get an' + " error from the server.", + "SequenceNoEndScenario": 'Send a sequence with "start" flag but that never ends. 
The' + " sequence should be aborted by the server and its slot reused" + " for another sequence.", + "TimeoutScenario": "Expect an exception for small timeout values.", + "ResNetScenario": "Send a request using resnet model.", + "CrashingScenario": "Client crashes in the middle of inferences.", + "PerfAnalyzerScenario": "Client that maintains a specific load.", + } + + f = open("stress_report.txt", "w") + f.write( + "Test Duration: {:0>2}:{:0>2}:{:0>2} (HH:MM:SS)\n".format( + int(hrs), int(mins), int(secs) + ) + ) + + t = prettytable.PrettyTable(hrules=prettytable.ALL) + t.field_names = [ + "Test Case", + "Number of Failures", + "Test Count", + "Request Count", + "Test Case Description", + ] + + t.align["Test Case"] = "l" + t.align["Number of Failures"] = "l" + t.align["Test Count"] = "l" + t.align["Request Count"] = "l" + t.align["Test Case Description"] = "l" + + acc_test_case_count = {} + acc_failed_test_case_count = {} + acc_sequence_request_count = {} + + for c in test_case_description: + # Accumulate all the individual thread counts + acc_test_case_count[c] = accumulate_count(_test_case_count, c) + acc_failed_test_case_count[c] = accumulate_count(_failed_test_case_count, c) + acc_sequence_request_count[c] = accumulate_count(_sequence_request_count, c) + + description = test_case_description[c] + # Add additional description on scenarios that allow failure + if c in ALLOW_FAILURE_SCENARIO: + description += ( + " Note that this scenario is marked to allow " + "failure due to subtle edge cases that will be " + "investigated in the future. However, only a " + "minimal failure count is expected and we should " + "take action if the number is concerning." + ) + t.add_row( + [ + c, + acc_failed_test_case_count[c] if c in acc_failed_test_case_count else 0, + acc_test_case_count[c] if c in acc_test_case_count else 0, + acc_sequence_request_count[c] if c in acc_sequence_request_count else 0, + format_content(description, 50), + ] + ) + + t.add_row( + [ + "TOTAL", + sum(acc_failed_test_case_count.values()), + sum(acc_test_case_count.values()), + sum(acc_sequence_request_count.values()), + "X", + ] + ) + + print(t) + f.write(str(t)) + + f.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-r", "--random-seed", type=int, required=False, help="Random seed." + ) + parser.add_argument( + "-t", + "--concurrency", + type=int, + required=False, + default=8, + help="Request concurrency. Default is 8.", + ) + parser.add_argument( + "--load-thread", + type=int, + required=False, + default=0, + help="Number of dedicated threads that keep compute " + "device (i.e. GPU/CPUs) under load. The load generated " + 'from "--concurrency" often behaves as request spike, ' + " this argument may be used to produce consistent load " + " to keep devices at high utilization. Default is 0, " + "which means no dedicated load thread will be created.", + ) + parser.add_argument( + "-d", + "--test-duration", + type=int, + required=False, + default=25000, + help="Duration of stress test to run. Default is 25000 seconds " + + "(approximately 7 hours).", + ) + FLAGS = parser.parse_args() + + # Initialize the random seed. For reproducibility each thread + # maintains its own RNG which is initialized based on this seed. 
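+    # The seed that is finally chosen gets printed below, so a failing run can
+    # be reproduced by passing the same value back in through --random-seed
+    # (for example "python stress.py -r <seed> -t <concurrency>"; the exact
+    # flags to repeat depend on how the original run was invoked).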
+ randseed = 0 + if FLAGS.random_seed != None: + randseed = FLAGS.random_seed + else: + randseed = int(time.time()) + np.random.seed(randseed) + + print("random seed = {}".format(randseed)) + print("concurrency = {}".format(FLAGS.concurrency)) + print("test duration = {}".format(FLAGS.test_duration)) + + # Create hashes for each thread for generating report + _test_case_count = [dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)] + _failed_test_case_count = [ + dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread) + ] + _sequence_request_count = [ + dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread) + ] + + threads = [] + + for idx in range(FLAGS.concurrency): + thread_name = "thread_{}".format(idx) + + # Create the seed for the thread. Since these are created in + # reproducible order off of the initial seed we will get + # reproducible results when given the same seed. + seed = np.random.randint(2**32) + + # Each thread is reserved a block of correlation IDs or size + # CORRELATION_ID_BLOCK_SIZE + correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE) + + threads.append( + threading.Thread( + target=stress_thread, + args=( + thread_name, + seed, + correlation_id_base, + _test_case_count[idx], + _failed_test_case_count[idx], + _sequence_request_count[idx], + ), + ) + ) + + for idx in range(FLAGS.load_thread): + thread_name = "load_thread_{}".format(idx) + + # Create the seed for the thread. Since these are created in + # reproducible order off of the initial seed we will get + # reproducible results when given the same seed. + seed = np.random.randint(2**32) + + # Each thread is reserved a block of correlation IDs or size + # CORRELATION_ID_BLOCK_SIZE + correlation_id_base = 1 + ( + (FLAGS.concurrency + idx) * CORRELATION_ID_BLOCK_SIZE + ) + + threads.append( + threading.Thread( + target=load_thread, + args=( + thread_name, + seed, + correlation_id_base, + _test_case_count[idx], + _failed_test_case_count[idx], + _sequence_request_count[idx], + ), + ) + ) + + exit_code = 0 + + start_time = time.time() + for t in threads: + t.start() + + while (time.time() - start_time) < FLAGS.test_duration: + time.sleep(1) + for t in threads: + # Stop the test early if there is early termination of a thread. + if not t.is_alive(): + exit_code = 1 + break + if exit_code != 0: + break + + STOP_STRESS_THREAD = True + for t in threads: + # Given long timeout to determine if a thread hangs + t.join(timeout=300) + # join() returns due to timeout + if t.is_alive() and (exit_code == 0): + exit_code = 1 + + generate_report( + time.time() - start_time, + _test_case_count, + _failed_test_case_count, + _sequence_request_count, + ) + + _thread_exceptions_mutex.acquire() + try: + if len(_thread_exceptions) > 0: + for thread, scenario, ex in _thread_exceptions: + print("*********\n* {} {}\n{}*********\n".format(thread, scenario, ex)) + if scenario not in ALLOW_FAILURE_SCENARIO: + exit_code = 1 + finally: + _thread_exceptions_mutex.release() + + print( + "Exiting stress test. In the case of failure, please refer to the thread log files for detail" + ) + sys.exit(exit_code) diff --git a/qa/L0_long_running_stress/stress_mail.py b/qa/L0_long_running_stress/stress_mail.py new file mode 100755 index 0000000000..36f347c2ac --- /dev/null +++ b/qa/L0_long_running_stress/stress_mail.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +from datetime import date + +import nightly_email_helper + +CI_JOB_ID = os.environ.get("CI_JOB_ID", "") + +if __name__ == "__main__": + today = date.today().strftime("%Y-%m-%d") + subject = ( + "Triton Long-Running Stress Test " + + ((sys.argv[1] + " ") if len(sys.argv) >= 2 else "") + + "Summary: " + + today + ) + stress_report = "stress_report.txt" + link = "https://gitlab-master.nvidia.com/dl/dgx/tritonserver/-/jobs/" + CI_JOB_ID + write_up = "

<p>The table below includes results from long-running stress test. Please refer to the description of each test case to see what different kinds of inference requests were sent. Request concurrency is set to 8.</p>"
+    write_up += (
+        "<p>Please check the CI output webpage for the details of the failures: "
+        + link
+        + "</p>"
+    )
+    html_content = (
+        '<html><head></head><body><pre style="font-size:11pt; font-family: Arial, Helvetica, sans-serif;">'
+        + write_up
+        + '</pre><pre style="font-size:11pt; font-family: Consolas;">'
+    )
+    with open(stress_report, "r") as f:
+        html_content += f.read() + "\n"
+    html_content += "</pre></body></html>
" + nightly_email_helper.send(subject, html_content, is_html=True) diff --git a/qa/L0_long_running_stress/test.sh b/qa/L0_long_running_stress/test.sh new file mode 100755 index 0000000000..b98a89f955 --- /dev/null +++ b/qa/L0_long_running_stress/test.sh @@ -0,0 +1,173 @@ +#!/bin/bash +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +STRESS_TEST=stress.py + +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +# If the test should be run in long and high load setting +if [ "$TRITON_PERF_LONG" == 1 ]; then + # ~ 6.5 days + TEST_DURATION=480000 + LOAD_THREAD_COUNT=2 + EMAIL_SUBJECT="Long" +else + # ~ 7 hours + TEST_DURATION=25000 + LOAD_THREAD_COUNT=0 + EMAIL_SUBJECT="" +fi + +RET=0 + +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch"} +export BACKENDS + +export CI_JOB_ID=${CI_JOB_ID} + +MODEL_DIR=models + +rm -fr *.log *.txt models validation_data csv_dir && mkdir models validation_data csv_dir + +# Get the datatype to use based on the backend +function get_datatype () { + local dtype='int32' + if [[ $1 == "plan" ]] || [[ $1 == "savedmodel" ]]; then + dtype='float32' + elif [[ $1 == "graphdef" ]]; then + dtype='object' + fi + echo $dtype +} + +# Setup model repository - two instances with batch-size 2 +MODELS="" +for BACKEND in $BACKENDS; do + DTYPE=$(get_datatype $BACKEND) + MODELS="$MODELS $DATADIR/qa_sequence_model_repository/${BACKEND}_sequence_${DTYPE}" +done + +for MODEL in $MODELS; do + cp -r $MODEL $MODEL_DIR/. 
&& \ + (cd $MODEL_DIR/$(basename $MODEL) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 2/" config.pbtxt && \ + sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 7000000/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 2/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 2/" config.pbtxt) +done + +MODELS="" +for BACKEND in $BACKENDS; do + DTYPE=$(get_datatype $BACKEND) + MODELS="$MODELS $DATADIR/qa_sequence_model_repository/${BACKEND}_nobatch_sequence_${DTYPE}" +done + +for MODEL in $MODELS; do + cp -r $MODEL $MODEL_DIR/. && \ + (cd $MODEL_DIR/$(basename $MODEL) && \ + sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 7000000/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 2/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 2/" config.pbtxt) +done + +MODELS="" +for BACKEND in $BACKENDS; do + MODELS="$MODELS $DATADIR/qa_identity_model_repository/${BACKEND}_nobatch_zero_1_float32" +done + +for MODEL in $MODELS; do + cp -r $MODEL $MODEL_DIR/. && \ + (cd $MODEL_DIR/$(basename $MODEL) && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) +done +cp -r ../custom_models/custom_zero_1_float32 $MODEL_DIR/custom_zero_1_float32 && \ + mkdir $MODEL_DIR/custom_zero_1_float32/1 && \ + (cd $MODEL_DIR/custom_zero_1_float32 && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"10000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +cp -r $DATADIR/tf_model_store/resnet_v1_50_graphdef $MODEL_DIR/resnet_v1_50_graphdef_def && \ + (cd $MODEL_DIR/resnet_v1_50_graphdef_def && \ + sed -i 's/^name: "resnet_v1_50_graphdef"/name: "resnet_v1_50_graphdef_def"/' config.pbtxt && \ + echo "optimization { }" >> config.pbtxt) + +SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR" +SERVER_LOG="./server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $STRESS_TEST -d ${TEST_DURATION} --load-thread ${LOAD_THREAD_COUNT} >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +# Run only if both TRITON_FROM and TRITON_TO_DL are set +if [[ ! -z "$TRITON_FROM" ]] && [[ ! -z "$TRITON_TO_DL" ]]; then + python stress_mail.py "$EMAIL_SUBJECT" +fi + +exit $RET diff --git a/qa/L0_memory/test.sh b/qa/L0_memory/test.sh new file mode 100755 index 0000000000..e7c08d9453 --- /dev/null +++ b/qa/L0_memory/test.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +TEST_LOG="./memory_test.log" +MEMORY_TEST=./memory_test +PINNED_MEMORY_MANAGER_TEST=./pinned_memory_manager_test + +RET=0 + +# Must run on multiple devices +export CUDA_VISIBLE_DEVICES=0,1 + +rm -f TEST_LOG + +set +e +$MEMORY_TEST >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Memory Test Failed\n***" + RET=1 +fi +set -e + +set +e +$PINNED_MEMORY_MANAGER_TEST >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Pinned Memory Manager Test Failed\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_memory_growth/busy_op_test.py b/qa/L0_memory_growth/busy_op_test.py new file mode 100755 index 0000000000..b7916090fa --- /dev/null +++ b/qa/L0_memory_growth/busy_op_test.py @@ -0,0 +1,99 @@ +#!/usr/bin/python + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
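+#
+# Rough summary of what this client does (see the code below): it creates an
+# HTTP client whose concurrency matches --num-requests so the asynchronous
+# requests are not serialized, builds one random FP32 input of shape
+# [1, 5 * 1024 * 1024], fires --num-requests asynchronous inferences at the
+# model named by --model, and then waits for every result. The accompanying
+# test.sh drives it as "python busy_op_test.py -v -m busy_op -n 100" so that
+# requests pile up behind the deliberately slow model and memory growth can
+# be observed.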
+ +import argparse +from builtins import range + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + parser.add_argument( + "-n", + "--num-requests", + type=int, + required=True, + help="Number of asynchronous requests to launch.", + ) + + FLAGS = parser.parse_args() + + # Run the busyop model which takes a delay as input. + model_name = FLAGS.model + + # Create the inference context for the model. Need to set the concurrency + # based on the number of requests so that the delivery of the async + # requests is not blocked. + # See the comment for more details: https://github.com/triton-inference-server/client/blob/r24.02/src/python/library/tritonclient/http/_client.py#L1501 + client = httpclient.InferenceServerClient( + FLAGS.url, verbose=FLAGS.verbose, concurrency=FLAGS.num_requests + ) + + # Collect async requests here + requests = [] + + # Create the data for the input tensor. Creating tensor size with 5 MB. + tensor_size = [1, 5 * 1024 * 1024] + input_data = np.random.randn(*tensor_size).astype(np.float32) + + inputs = [ + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + + # Send requests + for i in range(FLAGS.num_requests): + requests.append(client.async_infer(model_name, inputs)) + print("Sent request %d" % i, flush=True) + # wait for requests to finish + for i in range(len(requests)): + requests[i].get_result() + print("Received result %d" % i, flush=True) diff --git a/qa/L0_memory_growth/server_memory_mail.py b/qa/L0_memory_growth/server_memory_mail.py new file mode 100755 index 0000000000..d1307d97a6 --- /dev/null +++ b/qa/L0_memory_growth/server_memory_mail.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import glob +from datetime import date + +import nightly_email_helper + +if __name__ == "__main__": + today = date.today().strftime("%Y-%m-%d") + subject = "Triton Server Memory Growth " + sys.argv[1] + " Summary: " + today + memory_graphs_resnet = glob.glob("memory_growth_resnet*.log") + memory_graphs_busyop = glob.glob("memory_growth_busyop.log") + write_up = "

<p>This test uses perf_analyzer as clients running on 4 different models. The max allowed difference between mean and maximum memory usage is set to 150MB.</p>"
+    write_up += "<p>• What to look for<br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
+    html_content = (
+        '<html><head></head><body><pre style="font-size:11pt; font-family: Arial, Helvetica, sans-serif;">'
+        + write_up
+        + '</pre><pre style="font-size:11pt; font-family: Consolas;">'
+    )
+    for mem_graph in sorted(memory_graphs_resnet):
+        html_content += "\n" + mem_graph + "\n"
+        with open(mem_graph, "r") as f:
+            html_content += f.read() + "\n"
+
+    html_content += "<p>The busyop test is by design to show that actual memory growth is correctly detected and displayed.</p>"
+
+    # When we see PTX failures in CI, the busyop memory graph is not created.
+    if len(memory_graphs_busyop):
+        write_up = "<p>• What to look for<br>The memory usage should increase continually over time, and a linear growth should be observed in the graph below.</p>"
+        html_content += (
+            '<pre style="font-size:11pt; font-family: Arial, Helvetica, sans-serif;">'
+            + write_up
+            + '</pre><pre style="font-size:11pt; font-family: Consolas;">'
+        )
+        for mem_graph in sorted(memory_graphs_busyop):
+            html_content += "\n" + mem_graph + "\n"
+            with open(mem_graph, "r") as f:
+                html_content += f.read() + "\n"
+    else:
+        html_content += (
+            "<p>The busyop model caused PTX failures when running the CI.</p>"
+        )
+    html_content += "</pre></body></html>
" + nightly_email_helper.send(subject, html_content, is_html=True) diff --git a/qa/L0_memory_growth/test.sh b/qa/L0_memory_growth/test.sh new file mode 100755 index 0000000000..721f4f7b35 --- /dev/null +++ b/qa/L0_memory_growth/test.sh @@ -0,0 +1,311 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Single GPU +export CUDA_VISIBLE_DEVICES=0 + +# Clients +PERF_ANALYZER=../clients/perf_analyzer +IMAGE=../images/vulture.jpeg + +# Models +TRTEXEC=/usr/src/tensorrt/bin/trtexec +DATADIR=/data/inferenceserver/${REPO_VERSION} + +# Server +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=1200 + +# Valgrind massif +LEAKCHECK=/usr/bin/valgrind +LEAKCHECK_ARGS_BASE="--tool=massif --time-unit=B" +MASSIF_TEST=../common/check_massif_log.py + +source ../common/util.sh + +# Function that checks the massif logs +function check_massif_log () { + local massif_out=$1 +} + +rm -rf *.log models/ *.massif + +# Test parameters +STATIC_BATCH=128 +INSTANCE_CNT=2 +CONCURRENCY=20 +CLIENT_BS=8 + +# Set the number of repetitions in nightly and weekly tests +# Set the email subject for nightly and weekly tests +if [ "$TRITON_PERF_WEEKLY" == 1 ]; then + if [ "$TRITON_PERF_LONG" == 1 ]; then + # ~ 2.5 days for system under test + REPETITION=1400 + EMAIL_SUBJECT="Weekly Long" + else + # Run the test for each model approximately 1.5 hours + # All tests are run cumulatively for 7 hours + REPETITION=200 + EMAIL_SUBJECT="Weekly" + fi +else + REPETITION=10 + EMAIL_SUBJECT="Nightly" +fi + +# Threshold memory growth in MB +# NOTES: +# - Bounded memory growth tests typically show < 70 MB usage +# - Plan/ONNX is typically between 20-40 MB +# - Savedmodel is closer to 50-70 MB +# - Unbounded memory growth test typically shows > 100 MB usage +export MAX_ALLOWED_ALLOC="100" + +# Create local model repository +mkdir -p models/ +cp -r $DATADIR/perf_model_store/resnet50* models/ + +# Create the TensorRT plan from ONNX model +rm -fr models/resnet50_fp32_plan && mkdir -p models/resnet50_fp32_plan/1 && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/1/model.onnx models/resnet50_fp32_plan/ && \ +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/labels.txt models/resnet50_fp32_plan/ + +set +e +# Build TRT engine +$TRTEXEC --onnx=models/resnet50_fp32_plan/model.onnx --saveEngine=models/resnet50_fp32_plan/1/model.plan \ + --minShapes=input:1x3x224x224 --optShapes=input:${STATIC_BATCH}x3x224x224 \ + --maxShapes=input:${STATIC_BATCH}x3x224x224 + +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Failed to generate resnet50 PLAN\n***" + exit 1 +fi + +set -e + +rm models/resnet50_fp32_plan/model.onnx +cp $DATADIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/config.pbtxt models/resnet50_fp32_plan/ && \ +sed -i "s/^name: .*/name: \"resnet50_fp32_plan\"/g" models/resnet50_fp32_plan/config.pbtxt && \ +sed -i 's/^platform: .*/platform: "tensorrt_plan"/g' models/resnet50_fp32_plan/config.pbtxt + +RET=0 + +for MODEL in $(ls models); do + # Skip the resnet50_fp32_libtorch model as it is running into `misaligned address' + # Tracked here: https://nvbugs/3954104 + if [ "$MODEL" == "resnet50_fp32_libtorch" ]; then + continue + fi + + # Create temporary model repository and copy only the model being tested + rm -rf test_repo && mkdir test_repo + cp -r models/$MODEL test_repo/ + + # Set server, client and valgrind arguments + SERVER_ARGS="--model-repository=`pwd`/test_repo" + LEAKCHECK_LOG="test_${MODEL}.valgrind.log" + MASSIF_LOG="test_${MODEL}.massif" + GRAPH_LOG="memory_growth_${MODEL}.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --massif-out-file=$MASSIF_LOG --max-threads=3000 --log-file=$LEAKCHECK_LOG" + SERVER_LOG="test_$MODEL.server.log" + CLIENT_LOG="test_$MODEL.client.log" + + # Enable dynamic batching, set max batch size and instance count + if [ "$MODEL" == "resnet50_fp32_libtorch" ]; then + sed -i "s/^max_batch_size:.*/max_batch_size: 32/" test_repo/$MODEL/config.pbtxt + else + sed -i "s/^max_batch_size:.*/max_batch_size: ${STATIC_BATCH}/" test_repo/$MODEL/config.pbtxt + fi + echo "dynamic_batching {}" >> test_repo/$MODEL/config.pbtxt + echo "instance_group [{ count: ${INSTANCE_CNT} }]" >> test_repo/$MODEL/config.pbtxt + + # Run the server + run_server_leakcheck + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + TEMP_CLIENT_LOG=temp_client.log + TEMP_RET=0 + + SECONDS=0 + # Run the perf analyzer 'REPETITION' times + for ((i=1; i<=$REPETITION; i++)); do + # [TMA-621] Use --no-stability mode in perf analyzer when available + $PERF_ANALYZER -v -m $MODEL -i grpc --concurrency-range $CONCURRENCY -b $CLIENT_BS -p 10000 > $TEMP_CLIENT_LOG 2>&1 + PA_RET=$? + # Success + if [ ${PA_RET} -eq 0 ]; then + continue + # Unstable measurement: OK for this test + elif [ ${PA_RET} -eq 2 ]; then + continue + # Other failures unexpected, report error + else + cat $TEMP_CLIENT_LOG >> $CLIENT_LOG + echo -e "\n***\n*** perf_analyzer for $MODEL failed on iteration $i\n***" >> $CLIENT_LOG + RET=1 + fi + done + TEST_DURATION=$SECONDS + + set -e + + # Stop Server + kill $SERVER_PID + wait $SERVER_PID + + set +e + + # Log test duration and the graph for memory growth + hrs=$(printf "%02d" $((TEST_DURATION / 3600))) + mins=$(printf "%02d" $(((TEST_DURATION / 60) % 60))) + secs=$(printf "%02d" $((TEST_DURATION % 60))) + echo -e "Test Duration: $hrs:$mins:$secs (HH:MM:SS)" >> ${GRAPH_LOG} + ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG} + cat ${GRAPH_LOG} + # Check the massif output + python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test for $MODEL Failed.\n***" + RET=1 + fi + # Always output memory usage for easier triage of MAX_ALLOWED_ALLOC settings in the future + grep -i "Change in memory allocation" "${CLIENT_LOG}" || true + set -e +done + +# Next perform a test that has unbound memory growth. 
Use the busy op Python model +# with a sleep function in order to force requests to sit in the queue, and result +# in memory growth. +BUSY_OP_TEST=busy_op_test.py +NUM_REQUESTS=100 + +rm -rf test_repo && mkdir test_repo +mkdir -p test_repo/busy_op/1/ +cp ../python_models/busy_op/model.py test_repo/busy_op/1/ +cp ../python_models/busy_op/config.pbtxt test_repo/busy_op + +SERVER_ARGS="--model-repository=`pwd`/test_repo" + +LEAKCHECK_LOG="test_busyop.valgrind.log" +MASSIF_LOG="test_busyop.massif" +GRAPH_LOG="memory_growth_busyop.log" +LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --massif-out-file=$MASSIF_LOG --max-threads=3000 --log-file=$LEAKCHECK_LOG" +SERVER_LOG="test_busyop.server.log" +CLIENT_LOG="test_busyop.client.log" +SKIP_BUSYOP=0 + +# Run server +run_server_leakcheck +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + if [ `grep -c "provided PTX was compiled" $SERVER_LOG` != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER due to PTX issue\n***" + SKIP_BUSYOP=1 + else + echo -e "\n***\n*** Failed to start $SERVER\n***" + exit 1 + fi +fi + +set +e + +# Run the busy_op test if no PTX issue was observed when launching server +if [ $SKIP_BUSYOP -ne 1 ]; then + SECONDS=0 + python $BUSY_OP_TEST -v -m busy_op -n $NUM_REQUESTS > $CLIENT_LOG 2>&1 + TEST_RETCODE=$? + TEST_DURATION=$SECONDS + if [ ${TEST_RETCODE} -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** busy_op_test.py Failed\n***" + RET=1 + fi + set -e + + # Stop Server + kill $SERVER_PID + wait $SERVER_PID + + set +e + + # Log test duration and the graph for memory growth + hrs=$(printf "%02d" $((TEST_DURATION / 3600))) + mins=$(printf "%02d" $(((TEST_DURATION / 60) % 60))) + secs=$(printf "%02d" $((TEST_DURATION % 60))) + echo -e "Test Duration: $hrs:$mins:$secs (HH:MM:SS)" >> ${GRAPH_LOG} + ms_print ${MASSIF_LOG} | head -n35 >> ${GRAPH_LOG} + cat ${GRAPH_LOG} + # Check the massif output + python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1 + # This busyop test is expected to return a non-zero error since it is + # intentionally testing unbounded growth. If it returns success for some + # reason, raise error. + if [ $? -ne 1 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Massif test for graphdef_busyop Failed\n***" + echo -e "\n***\n*** Expected unbounded growth, but found acceptable growth within ${MAX_ALLOWED_ALLOC} MB\n***" + RET=1 + fi + # Always output memory usage for easier triage of MAX_ALLOWED_ALLOC settings in the future + grep -i "Change in memory allocation" "${CLIENT_LOG}" || true +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +# Run only if both TRITON_FROM and TRITON_TO_DL are set +if [[ ! -z "$TRITON_FROM" ]] && [[ ! -z "$TRITON_TO_DL" ]]; then + python server_memory_mail.py "$EMAIL_SUBJECT" +fi + +exit $RET diff --git a/qa/L0_metrics/cpu_metrics_test.py b/qa/L0_metrics/cpu_metrics_test.py new file mode 100755 index 0000000000..9a34891557 --- /dev/null +++ b/qa/L0_metrics/cpu_metrics_test.py @@ -0,0 +1,187 @@ +#!/usr/bin/python +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import threading +import time +import unittest +from collections import defaultdict + +import numpy as np +import requests +import tritonclient.http as httpclient + +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +CPU_UTILIZATION = "nv_cpu_utilization" +CPU_USED_MEMORY = "nv_cpu_memory_used_bytes" +CPU_TOTAL_MEMORY = "nv_cpu_memory_total_bytes" + + +def get_metrics(): + utilization_pattern = re.compile(rf"{CPU_UTILIZATION} (\d+\.?\d*)") + used_bytes_pattern = re.compile(rf"{CPU_USED_MEMORY} (\d+)") + total_bytes_pattern = re.compile(rf"{CPU_TOTAL_MEMORY} (\d+)") + + r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics") + r.raise_for_status() + + utilization_match = utilization_pattern.search(r.text) + utilization_value = float(utilization_match.group(1)) + + used_bytes_match = used_bytes_pattern.search(r.text) + used_bytes_value = int(used_bytes_match.group(1)) + + total_bytes_match = total_bytes_pattern.search(r.text) + total_bytes_value = int(total_bytes_match.group(1)) + + return utilization_value, used_bytes_value, total_bytes_value + + +class TestCpuMetrics(unittest.TestCase): + def setUp(self): + self.inference_completed = threading.Event() + + shape = [1, 16] + self.model_name = "libtorch_float32_float32_float32" + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + + self.inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, "FP32" + ).set_data_from_numpy(input0_data), + httpclient.InferInput( + "INPUT1", input1_data.shape, "FP32" + ).set_data_from_numpy(input1_data), + ] + + def _validate_metric_variance(self, observed_metrics: dict): + dupe_value_tolerance = 5 + for metric in [CPU_UTILIZATION, CPU_USED_MEMORY]: + observed_values = observed_metrics[metric] + observed_count = len(observed_values) + print( + f"Observed {metric} count: {observed_count}, values: {observed_values}" + ) + + # Must have at least 1 more than the duplicate tolerance + self.assertGreater( + observed_count, + dupe_value_tolerance, + f"Found too many sequential duplicate values for {metric}. 
Double check the server-side --metrics-interval and observation interval in this test, or consider tuning the duplicate tolerance.", + ) + + # Don't allow observed metric values to be repeated sequentially + # more than a certain tolerance. The expectation is that these metrics + # will vary while the server is processing requests in the background, + # provided the server was configured with a small metrics update interval. + sequential_dupes = 0 + max_sequential_dupes = 0 + prev_value = observed_values[0] + for value in observed_values[1:]: + if value == prev_value: + sequential_dupes += 1 + else: + # If unique value found, reset counter + sequential_dupes = 0 + + # For future observability on dupe frequency to tune the tolerance + if sequential_dupes > max_sequential_dupes: + max_sequential_dupes = sequential_dupes + + self.assertLess(sequential_dupes, dupe_value_tolerance) + prev_value = value + + print( + f"Max sequential duplicate values found for {metric}: {max_sequential_dupes}" + ) + + def _collect_metrics(self, observed_metrics, interval_secs=1): + """ + Collects metrics at provided 'interval_secs' and stores them in the + provided 'observed_metrics' dictionary for postprocessing. + """ + # Give the test and server some time to begin processing requests + # before beginning observation loop. + time.sleep(1) + + while not self.inference_completed.is_set(): + util_value, used_memory_value, _ = get_metrics() + observed_metrics[CPU_UTILIZATION].append(util_value) + observed_metrics[CPU_USED_MEMORY].append(used_memory_value) + time.sleep(interval_secs) + + def test_cpu_metrics_during_inference(self): + with httpclient.InferenceServerClient( + url=f"{_tritonserver_ipaddr}:8000", concurrency=10 + ) as client: + # Start a thread to collect metrics asynchronously while inferences are + # executing, store them in a dictionary for postprocessing validation. + observed_metrics = defaultdict(list) + metrics_thread = threading.Thread( + target=self._collect_metrics, args=(observed_metrics,) + ) + metrics_thread.start() + + # Fire off many asynchronous inference requests to keep server + # busy while monitoring the CPU metrics. Ideal target is about + # 20-30 seconds of inference to get a good number of metric samples. + async_requests = [] + for _ in range(2000): + async_requests.append( + client.async_infer( + model_name=self.model_name, + inputs=self.inputs, + ) + ) + + # Wait for all inference requests to complete + for async_request in async_requests: + async_request.get_result() + + # Set the event to indicate that inference is completed + self.inference_completed.set() + + # Wait for the metrics thread to complete + metrics_thread.join() + + self._validate_metric_variance(observed_metrics) + + def test_cpu_metrics_ranges(self): + # Test some simple sanity checks on the expected ranges of values + # for the CPU related metrics. + utilization, used_memory, total_memory = get_metrics() + self.assertTrue(0 <= utilization <= 1.0) + self.assertTrue(0 <= used_memory <= total_memory) + # NOTE: Can be improved in future to compare upper bound against psutil + # system memory if we introduce the dependency into the test/container. + self.assertGreater(total_memory, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_metrics/ensemble_delay/config.pbtxt b/qa/L0_metrics/ensemble_delay/config.pbtxt new file mode 100644 index 0000000000..0eaa2f76f7 --- /dev/null +++ b/qa/L0_metrics/ensemble_delay/config.pbtxt @@ -0,0 +1,67 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +platform: "ensemble" +max_batch_size: 4 + +input [ + { + name: "ENSEMBLE_INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "ENSEMBLE_OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "ENSEMBLE_OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +ensemble_scheduling +{ + step [ + { + model_name: "dynamic_composing" + model_version: -1 + input_map { key: "INPUT0", value: "ENSEMBLE_INPUT0" } + output_map { key: "OUTPUT0", value: "ENSEMBLE_OUTPUT0" } + }, + { + model_name: "default_composing" + model_version: -1 + input_map { key: "INPUT0", value: "ENSEMBLE_INPUT0" } + output_map { key: "OUTPUT0", value: "ENSEMBLE_OUTPUT1" } + } + ] +} diff --git a/qa/L0_metrics/identity_delay/config.pbtxt b/qa/L0_metrics/identity_delay/config.pbtxt new file mode 100644 index 0000000000..1062868c2b --- /dev/null +++ b/qa/L0_metrics/identity_delay/config.pbtxt @@ -0,0 +1,58 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "identity" +max_batch_size: 4 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + +parameters [ + { + key: "execute_delay_ms" + value: { string_value: "2000" } + } +] diff --git a/qa/L0_metrics/metrics_config_test.py b/qa/L0_metrics/metrics_config_test.py new file mode 100755 index 0000000000..9153366c04 --- /dev/null +++ b/qa/L0_metrics/metrics_config_test.py @@ -0,0 +1,159 @@ +#!/usr/bin/python +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
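+#
+# Note on structure: the *_exist and *_missing cases below are intentionally
+# complementary and are presumably run selectively (rather than as one suite)
+# by the accompanying test.sh, matching whichever metrics configuration the
+# server was started with; test_summaries_custom_quantiles additionally
+# expects the caller to export SUMMARY_QUANTILES with the quantile pairs that
+# were passed to the server.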
+ +import os +import sys + +sys.path.append("../common") + +import unittest + +import requests +import test_util as tu + +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +INF_COUNTER_PATTERNS = [ + "nv_inference_request_duration", + "nv_inference_queue_duration", + "nv_inference_compute_input_duration", + "nv_inference_compute_infer_duration", + "nv_inference_compute_output_duration", +] +INF_SUMMARY_PATTERNS = [ + "nv_inference_request_summary", + "nv_inference_queue_summary", + "nv_inference_compute_input_summary", + "nv_inference_compute_infer_summary", + "nv_inference_compute_output_summary", +] +CACHE_COUNTER_PATTERNS = [ + "nv_cache_num_hits_per_model", + "nv_cache_num_misses_per_model", + "nv_cache_hit_duration_per_model", + "nv_cache_miss_duration_per_model", +] +PINNED_MEMORY_PATTERNS = [ + "nv_pinned_memory_pool_total_bytes", + "nv_pinned_memory_pool_used_bytes", +] +CACHE_SUMMARY_PATTERNS = ["nv_cache_hit_summary", "nv_cache_miss_summary"] + + +class MetricsConfigTest(tu.TestResultCollector): + def _get_metrics(self): + metrics_url = f"http://{_tritonserver_ipaddr}:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def test_pinned_memory_metrics_exist(self): + metrics = self._get_metrics() + for metric in PINNED_MEMORY_PATTERNS: + self.assertIn(metric, metrics) + + # Counters + def test_inf_counters_exist(self): + metrics = self._get_metrics() + for metric in INF_COUNTER_PATTERNS: + self.assertIn(metric, metrics) + + def test_inf_counters_missing(self): + metrics = self._get_metrics() + for metric in INF_COUNTER_PATTERNS: + self.assertNotIn(metric, metrics) + + def test_cache_counters_exist(self): + metrics = self._get_metrics() + for metric in CACHE_COUNTER_PATTERNS: + self.assertIn(metric, metrics) + + def test_cache_counters_missing(self): + metrics = self._get_metrics() + for metric in CACHE_COUNTER_PATTERNS: + self.assertNotIn(metric, metrics) + + # Summaries + def test_inf_summaries_exist(self): + metrics = self._get_metrics() + for metric in INF_SUMMARY_PATTERNS: + self.assertIn(metric, metrics) + + def test_inf_summaries_missing(self): + metrics = self._get_metrics() + for metric in INF_SUMMARY_PATTERNS: + self.assertNotIn(metric, metrics) + + def test_cache_summaries_exist(self): + metrics = self._get_metrics() + for metric in CACHE_SUMMARY_PATTERNS: + self.assertIn(metric, metrics) + + def test_cache_summaries_missing(self): + metrics = self._get_metrics() + for metric in CACHE_SUMMARY_PATTERNS: + self.assertNotIn(metric, metrics) + + def test_summaries_custom_quantiles(self): + metrics = self._get_metrics() + # This env var should be set by test.sh or caller + quantile_pairs = os.environ.get("SUMMARY_QUANTILES", None) + self.assertIsNotNone(quantile_pairs) + + quantiles = [pair.split(":")[0] for pair in quantile_pairs.split(",")] + print(metrics) + for quantile in quantiles: + print(quantile) + self.assertIn(f'quantile="{quantile}"', metrics) + + # DLIS-4762: Disable request summary when caching enabled for now + def test_inf_summaries_exist_with_cache(self): + metrics = self._get_metrics() + bad_patterns = ["nv_inference_request_summary"] + ok_patterns = list(set(INF_SUMMARY_PATTERNS) - set(bad_patterns)) + for metric in ok_patterns: + self.assertIn(metric, metrics) + for metric in bad_patterns: + self.assertNotIn(metric, metrics) + + def test_model_namespacing_label_with_namespace_on(self): + metrics = self._get_metrics() + expected_namespaces = [ + 
"/opt/tritonserver/qa/L0_metrics/model_namespacing_repos/addsub_repo", + "/opt/tritonserver/qa/L0_metrics/model_namespacing_repos/subadd_repo", + ] + for namespace in expected_namespaces: + label = 'namespace="' + namespace + '"' + self.assertIn(label, metrics) + + def test_model_namespacing_label_with_namespace_off(self): + metrics = self._get_metrics() + self.assertNotIn('namespace="', metrics) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_metrics/metrics_queue_size_test.py b/qa/L0_metrics/metrics_queue_size_test.py new file mode 100755 index 0000000000..40203d5fbc --- /dev/null +++ b/qa/L0_metrics/metrics_queue_size_test.py @@ -0,0 +1,309 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys + +sys.path.append("../common") + +import math +import time +import unittest +from functools import partial + +import numpy as np +import requests +import test_util as tu +import tritonclient.http +from tritonclient.utils import triton_to_np_dtype + +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + +QUEUE_METRIC_TEMPLATE = ( + 'nv_inference_pending_request_count{{model="{model_name}",version="1"}}' +) +INFER_METRIC_TEMPLATE = 'nv_inference_count{{model="{model_name}",version="1"}}' +EXEC_METRIC_TEMPLATE = 'nv_inference_exec_count{{model="{model_name}",version="1"}}' + + +class MetricsPendingRequestCountTest(tu.TestResultCollector): + def setUp(self): + self.metrics = None + self.metrics_url = f"http://{_tritonserver_ipaddr}:8002/metrics" + self.server_url = f"{_tritonserver_ipaddr}:8000" + + # Used to verify model config is set to expected values + self.max_batch_size = 4 + self.delay_ms = 2000 + self.delay_sec = self.delay_ms // 1000 + + # Setup dummy inputs + dtype = "FP32" + shape = (1, 1) + input_np = np.ones(shape, dtype=triton_to_np_dtype(dtype)) + self.inputs = [ + tritonclient.http.InferInput("INPUT0", shape, dtype).set_data_from_numpy( + input_np + ) + ] + self.ensemble_inputs = [ + tritonclient.http.InferInput( + "ENSEMBLE_INPUT0", shape, dtype + ).set_data_from_numpy(input_np) + ] + + # Verify values for filling request queues + self.num_requests = 10 + self.concurrency = 10 + # Concurrency must be at least as high as number of async requests we intend + # to send N requests to fill request queues before blocking on any results. + self.assertGreaterEqual(self.concurrency, self.num_requests) + self.client = tritonclient.http.InferenceServerClient( + url=self.server_url, concurrency=self.concurrency + ) + + # Test specific configurations + self.max_queue_size = 0 + + def _validate_model_config(self, model_name, max_queue_size=0): + config = self.client.get_model_config(model_name) + print(config) + params = config.get("parameters", {}) + delay_ms = int(params.get("execute_delay_ms", {}).get("string_value")) + max_batch_size = config.get("max_batch_size") + self.assertEqual(delay_ms, self.delay_ms) + self.assertEqual(max_batch_size, self.max_batch_size) + + dynamic_batching = config.get("dynamic_batching", {}) + default_queue_policy = dynamic_batching.get("default_queue_policy", {}) + self.max_queue_size = default_queue_policy.get("max_queue_size", 0) + + self.assertEqual(self.max_queue_size, max_queue_size) + + return config + + def _get_metrics(self): + r = requests.get(self.metrics_url) + r.raise_for_status() + return r.text + + def _get_metric_line(self, metric, metrics): + for line in metrics.splitlines(): + if metric in line: + return line + return None + + def _get_metric_value(self, metric): + metrics = self._get_metrics() + self.assertIn(metric, metrics) + line = self._get_metric_line(metric, metrics) + print(line) + if not line: + return None + value = line.split()[1] + return float(value) + + def _assert_metric_equals(self, metric, expected_value): + value = self._get_metric_value(metric) + self.assertEqual(value, expected_value) + + def _assert_metric_greater_than(self, metric, gt_value): + value = self._get_metric_value(metric) + self.assertGreater(value, gt_value) + + def _send_async_requests(self, model_name, inputs, futures): + for _ in range(self.num_requests): + futures.append(self.client.async_infer(model_name, inputs)) + + def _send_async_requests_sequence(self, num_seq_slots, model_name, inputs, futures): + 
started_seqs = {} + num_sent = 0 + while num_sent < self.num_requests: + # Add requests to each sequence slot round-robin, seq_id must be > 0 + # We don't care about finishing any sequences, just need to queue up + # requests for each sequence until num_requests is hit. + seq_id = (num_sent % num_seq_slots) + 1 + # Toggle start flag to False after first request per sequence ID + start = True if seq_id not in started_seqs else False + started_seqs[seq_id] = True + futures.append( + self.client.async_infer( + model_name, + inputs, + request_id=str(num_sent), + sequence_id=seq_id, + sequence_start=start, + ) + ) + num_sent += 1 + + def _test_helper( + self, model_name, batch_size, send_requests_func, max_queue_size=0 + ): + self._validate_model_config(model_name, max_queue_size=max_queue_size) + + queue_size = QUEUE_METRIC_TEMPLATE.format(model_name=model_name) + infer_count = INFER_METRIC_TEMPLATE.format(model_name=model_name) + exec_count = EXEC_METRIC_TEMPLATE.format(model_name=model_name) + # Metric should be zero before sending any requests + self._assert_metric_equals(queue_size, 0) + # Send N requests, letting scheduler delay queue fill up when applicable + futures = [] + send_requests_func(model_name, self.inputs, futures) + # Give Triton a second to load all requests into queues + time.sleep(1) + + # Start from (num_requests-batch_size) because 1 batch should be executing, + # and the rest of the requests should be queued. + # If max_queue_size is specified then the queued requests would be capped + # at max_queue_size. + if max_queue_size != 0: + self._assert_metric_equals(queue_size, max_queue_size) + starting_queue_size = max_queue_size + else: + starting_queue_size = self.num_requests - batch_size + + for expected_queue_size in range(starting_queue_size, 0, -1 * batch_size): + self._assert_metric_equals(queue_size, expected_queue_size) + time.sleep(self.delay_sec) + # Queue should be empty now + self._assert_metric_equals(queue_size, 0) + # Let final batch finish + time.sleep(self.delay_sec) + + # All requests should've been executed without any batching + expected_infer_count = starting_queue_size + batch_size + self._assert_metric_equals(infer_count, expected_infer_count) + expected_exec_count = math.ceil(expected_infer_count / batch_size) + self._assert_metric_equals(exec_count, expected_exec_count) + + failed_count = 0 + for future in futures: + try: + future.get_result() + except Exception as e: + failed_count = failed_count + 1 + + self.assertEqual( + failed_count, self.num_requests - batch_size - starting_queue_size + ) + + def test_default_scheduler(self): + model_name = "default" + # Default scheduler won't do any batching + batch_size = 1 + self._test_helper(model_name, batch_size, self._send_async_requests) + + def test_dynamic_batch_scheduler(self): + model_name = "dynamic" + # With sufficient queue delay set, we expect full batches to be executed + batch_size = self.max_batch_size + self._test_helper(model_name, batch_size, self._send_async_requests) + + def test_fail_max_queue_size(self): + model_name = "max_queue_size" + # This test checks whether metrics are properly accounts for requests + # that fail to enqueue on the server. The test sets the max_queue_size + # and any additional requests beyond the specified queue size should fail + # instead of waiting for execution. 
+ batch_size = self.max_batch_size + self._test_helper( + model_name, batch_size, self._send_async_requests, max_queue_size=4 + ) + + def test_sequence_batch_scheduler_direct(self): + model_name = "sequence_direct" + # With sufficient queue delay and minimum_slot_utilization set, we + # expect full batches to be executed. + batch_size = self.max_batch_size + num_seq_slots = batch_size + send_requests_func = partial(self._send_async_requests_sequence, num_seq_slots) + self._test_helper(model_name, batch_size, send_requests_func) + + def test_sequence_batch_scheduler_oldest(self): + model_name = "sequence_oldest" + # With sufficient queue delay set, we expect full batches to be executed + batch_size = self.max_batch_size + num_seq_slots = batch_size + send_requests_func = partial(self._send_async_requests_sequence, num_seq_slots) + self._test_helper(model_name, batch_size, send_requests_func) + + def test_ensemble_scheduler(self): + ensemble_model_name = "ensemble" + composing_model_names = ["dynamic_composing", "default_composing"] + ensemble_queue_size = QUEUE_METRIC_TEMPLATE.format( + model_name=ensemble_model_name + ) + composing_queue_sizes = [ + QUEUE_METRIC_TEMPLATE.format(model_name=name) + for name in composing_model_names + ] + ensemble_infer_count = INFER_METRIC_TEMPLATE.format( + model_name=ensemble_model_name + ) + composing_infer_counts = [ + INFER_METRIC_TEMPLATE.format(model_name=name) + for name in composing_model_names + ] + + # Metric should be zero before sending any requests + self._assert_metric_equals(ensemble_queue_size, 0) + for queue_size in composing_queue_sizes: + self._assert_metric_equals(queue_size, 0) + # Send some ensemble requests + futures = [] + self._send_async_requests(ensemble_model_name, self.ensemble_inputs, futures) + # Give Triton time to pass some requests to composing models. This test + # is less comprehensive on checking exact queue values, and just verifies + # each composing queue gets filled and ensemble's queue is empty. + time.sleep(1) + + # Top-level ensemble size should still be zero, as all pending requests should + # be scheduled and reflected in composing models, and not considered "pending" at ensemble level. + self._assert_metric_equals(ensemble_queue_size, 0) + # Composing models should be non-zero + for queue_size in composing_queue_sizes: + self._assert_metric_greater_than(queue_size, 0) + + # Verify no inference exceptions were raised and let composing models + # finish their requests + for future in futures: + future.get_result() + + # Check that all queues are empty after getting results + self._assert_metric_equals(ensemble_queue_size, 0) + for queue_size in composing_queue_sizes: + self._assert_metric_equals(queue_size, 0) + + # Sanity check infer counts on ensemble and composing models + self._assert_metric_equals(ensemble_infer_count, self.num_requests) + for infer_count in composing_infer_counts: + self._assert_metric_equals(infer_count, self.num_requests) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_metrics/model_namespacing_repos/addsub_repo/addsub_ensemble/config.pbtxt b/qa/L0_metrics/model_namespacing_repos/addsub_repo/addsub_ensemble/config.pbtxt new file mode 100644 index 0000000000..5a8d9d56f2 --- /dev/null +++ b/qa/L0_metrics/model_namespacing_repos/addsub_repo/addsub_ensemble/config.pbtxt @@ -0,0 +1,83 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
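
Stepping back to the pending-request-count test above: it reads each sample by locating its fully labeled line and taking the second whitespace-separated field, then compares the queue, inference, and execution counters against values that can be worked out by hand. A standalone sketch of both steps, not part of the patch, using the same assumed numbers as the max_queue_size case (10 requests, batch size 4, queue capped at 4):

```python
import math

import requests

METRICS_URL = "http://localhost:8002/metrics"  # assumed default metrics port


def get_metric_value(sample, url=METRICS_URL):
    """Value of one fully labeled sample, e.g.
    'nv_inference_pending_request_count{model="default",version="1"}'."""
    text = requests.get(url, timeout=5).text
    for line in text.splitlines():
        # Sample lines look like: <name>{<labels>} <value>
        if line.startswith(sample):
            return float(line.split()[1])
    return None  # not exported (model unloaded, metric disabled, ...)


# Hand-computed expectations for the capped-queue case:
num_requests, batch_size, max_queue_size = 10, 4, 4
starting_queue_size = max_queue_size                                 # 4 queued
expected_infer_count = starting_queue_size + batch_size              # 8 executed
expected_exec_count = math.ceil(expected_infer_count / batch_size)   # 2 batches
expected_failures = num_requests - batch_size - starting_queue_size  # 2 rejected
```
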
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { } } + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +ensemble_scheduling { + step [ + { + model_name: "composing_model" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_metrics/model_namespacing_repos/addsub_repo/composing_model/1/model.py b/qa/L0_metrics/model_namespacing_repos/addsub_repo/composing_model/1/model.py new file mode 100644 index 0000000000..249b3a94c4 --- /dev/null +++ b/qa/L0_metrics/model_namespacing_repos/addsub_repo/composing_model/1/model.py @@ -0,0 +1,121 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + # Use auto complete feature to ship config.pbtxt along with the Python + # model definition + @staticmethod + def auto_complete_config(auto_complete_model_config): + # Only use packaged config if config is not explicitly provided + config = auto_complete_model_config.as_dict() + if (len(config["input"]) != 0) or (len(config["output"]) != 0): + return auto_complete_model_config + + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + return auto_complete_model_config + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + responses.append(pb_utils.InferenceResponse(self.addsub(in_0, in_1))) + return responses + + def addsub(self, in_0, in_1): + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) + return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_metrics/model_namespacing_repos/subadd_repo/composing_model/1/model.py b/qa/L0_metrics/model_namespacing_repos/subadd_repo/composing_model/1/model.py new file mode 100644 index 0000000000..1c30cc6183 --- /dev/null +++ b/qa/L0_metrics/model_namespacing_repos/subadd_repo/composing_model/1/model.py @@ -0,0 +1,121 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
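
The subadd_repo model that follows registers a composing model under the same name, composing_model, but with the two outputs swapped; the namespacing tests rely on this to tell the repositories apart by their namespace label. For reference, the arithmetic the two Python models implement, shown with plain numpy and purely for illustration:

```python
import numpy as np

a = np.arange(16, dtype=np.int32)
b = np.ones(16, dtype=np.int32)

# addsub_repo/composing_model: OUTPUT0 = INPUT0 + INPUT1, OUTPUT1 = INPUT0 - INPUT1
addsub_out0, addsub_out1 = a + b, a - b
# subadd_repo/composing_model: OUTPUT0 = INPUT0 - INPUT1, OUTPUT1 = INPUT0 + INPUT1
subadd_out0, subadd_out1 = a - b, a + b

print(addsub_out0[:4], subadd_out0[:4])  # e.g. [1 2 3 4] vs. [-1 0 1 2]
```
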
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + # Use auto complete feature to ship config.pbtxt along with the Python + # model definition + @staticmethod + def auto_complete_config(auto_complete_model_config): + # Only use packaged config if config is not explicitly provided + config = auto_complete_model_config.as_dict() + if (len(config["input"]) != 0) or (len(config["output"]) != 0): + return auto_complete_model_config + + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + return auto_complete_model_config + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + responses.append(pb_utils.InferenceResponse(self.subadd(in_0, in_1))) + return responses + + def subadd(self, in_0, in_1): + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + 
in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() - in_1.as_numpy(), + in_0.as_numpy() + in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) + return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_metrics/model_namespacing_repos/subadd_repo/subadd_ensemble/config.pbtxt b/qa/L0_metrics/model_namespacing_repos/subadd_repo/subadd_ensemble/config.pbtxt new file mode 100644 index 0000000000..5d549dba1d --- /dev/null +++ b/qa/L0_metrics/model_namespacing_repos/subadd_repo/subadd_ensemble/config.pbtxt @@ -0,0 +1,84 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { } } + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +ensemble_scheduling { + step [ + { + model_name: "composing_model" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_metrics/pinned_memory_metrics_test.py b/qa/L0_metrics/pinned_memory_metrics_test.py new file mode 100755 index 0000000000..5992219ae6 --- /dev/null +++ b/qa/L0_metrics/pinned_memory_metrics_test.py @@ -0,0 +1,179 @@ +#!/usr/bin/python +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
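
Both ensembles take two 16-element INT32 tensors and return their element-wise sum and difference (in opposite output order), so a client-side smoke check looks the same for either repository. A hedged sketch using the Triton HTTP client, not part of the patch and assuming a server already running at localhost:8000 with these repositories loaded:

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

inputs = [
    httpclient.InferInput("INPUT0", [16], "INT32"),
    httpclient.InferInput("INPUT1", [16], "INT32"),
]
inputs[0].set_data_from_numpy(np.arange(16, dtype=np.int32))
inputs[1].set_data_from_numpy(np.ones(16, dtype=np.int32))

result = client.infer("addsub_ensemble", inputs)
print(result.as_numpy("OUTPUT0"))  # element-wise sum
print(result.as_numpy("OUTPUT1"))  # element-wise difference
```
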
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import re +import threading +import time +import unittest + +import numpy as np +import requests +import tritonclient.http as httpclient +from tritonclient.utils import * + +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") +# Triton server reserves 256 MB for pinned memory by default. 
+DEFAULT_TOTAL_PINNED_MEMORY_SIZE = 2**28 # bytes, Equivalent to 256 MB +TOTAL_PINNED_MEMORY_SIZE = int( + os.environ.get("CUSTOM_PINNED_MEMORY_POOL_SIZE", DEFAULT_TOTAL_PINNED_MEMORY_SIZE) +) +print(f"TOTAL_PINNED_MEMORY_SIZE: {TOTAL_PINNED_MEMORY_SIZE} bytes") + +# Pinned memory usage when server is idle (no inference) +DEFAULT_USED_PINNED_MEMORY_SIZE = 0 # bytes + + +def get_metrics(): + total_bytes_pattern = re.compile(r"pool_total_bytes (\d+)") + used_bytes_pattern = re.compile(r"pool_used_bytes (\d+)") + + r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics") + r.raise_for_status() + + total_bytes_match = total_bytes_pattern.search(r.text) + total_bytes_value = total_bytes_match.group(1) + + used_bytes_match = used_bytes_pattern.search(r.text) + used_bytes_value = used_bytes_match.group(1) + + return total_bytes_value, used_bytes_value + + +class TestPinnedMemoryMetrics(unittest.TestCase): + def setUp(self): + self.inference_completed = threading.Event() + + shape = [1, 16] + self.model_name = "libtorch_float32_float32_float32" + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + + self.inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, "FP32" + ).set_data_from_numpy(input0_data), + httpclient.InferInput( + "INPUT1", input1_data.shape, "FP32" + ).set_data_from_numpy(input1_data), + ] + + self.outputs = [ + httpclient.InferRequestedOutput("OUTPUT__0"), + httpclient.InferRequestedOutput("OUTPUT__1"), + ] + + # Before loading the model + self._assert_pinned_memory_utilization() + + def _assert_pinned_memory_utilization(self): + total_bytes_value, used_bytes_value = get_metrics() + self.assertEqual(int(total_bytes_value), TOTAL_PINNED_MEMORY_SIZE) + self.assertEqual(int(used_bytes_value), DEFAULT_USED_PINNED_MEMORY_SIZE) + + def _collect_metrics(self): + while not self.inference_completed.is_set(): + total_bytes_value, used_bytes_value = get_metrics() + self.assertEqual(int(total_bytes_value), TOTAL_PINNED_MEMORY_SIZE) + # Assert pinned memory usage is within anticipated values + self.assertIn(int(used_bytes_value), [0, 64, 128, 192, 256]) + + def test_pinned_memory_metrics_asynchronous_requests(self): + with httpclient.InferenceServerClient( + url=f"{_tritonserver_ipaddr}:8000", concurrency=10 + ) as client: + if not client.is_model_ready(self.model_name): + client.load_model(self.model_name) + + # Before starting the inference + self._assert_pinned_memory_utilization() + + # Start a thread to collect metrics asynchronously + metrics_thread = threading.Thread(target=self._collect_metrics) + metrics_thread.start() + + # Asynchronous inference requests + async_requests = [] + for _ in range(100): + async_requests.append( + client.async_infer( + model_name=self.model_name, + inputs=self.inputs, + outputs=self.outputs, + ) + ) + + time.sleep(1) + + # Wait for all inference requests to complete + for async_request in async_requests: + async_request.get_result() + + # Set the event to indicate that inference is completed + self.inference_completed.set() + + # Wait for the metrics thread to complete + metrics_thread.join() + + # After Completing inference, used_bytes_value should comedown to 0 + self._assert_pinned_memory_utilization() + + def test_pinned_memory_metrics_synchronous_requests(self): + with httpclient.InferenceServerClient( + url=f"{_tritonserver_ipaddr}:8000" + ) as client: + if not client.is_model_ready(self.model_name): + client.load_model(self.model_name) + + # Before starting the 
inference + self._assert_pinned_memory_utilization() + + # Start a thread to collect metrics asynchronously + metrics_thread = threading.Thread(target=self._collect_metrics) + metrics_thread.start() + + # Synchronous inference requests + for _ in range(100): + response = client.infer( + model_name=self.model_name, inputs=self.inputs, outputs=self.outputs + ) + response.get_response() + + time.sleep(0.1) + + # Set the event to indicate that inference is completed + self.inference_completed.set() + + # Wait for the metrics thread to complete + metrics_thread.join() + + # After Completing inference, used_bytes_value should comedown to 0 + self._assert_pinned_memory_utilization() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh new file mode 100755 index 0000000000..76e99e7c48 --- /dev/null +++ b/qa/L0_metrics/test.sh @@ -0,0 +1,416 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +MODELDIR=`pwd`/models +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BASE_SERVER_ARGS="--model-repository=${MODELDIR}" +SERVER_ARGS="${BASE_SERVER_ARGS}" +SERVER_LOG="./inference_server.log" +PYTHON_TEST="metrics_config_test.py" +source ../common/util.sh + +CLIENT_LOG="client.log" +TEST_RESULT_FILE="test_results.txt" +function check_unit_test() { + if [ "${PIPESTATUS[0]}" -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + EXPECTED_NUM_TESTS="${1:-1}" + check_test_results ${TEST_RESULT_FILE} ${EXPECTED_NUM_TESTS} + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi +} + +function run_and_check_server() { + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi +} + +rm -f $SERVER_LOG +RET=0 + +if [ `ps | grep -c "tritonserver"` != "0" ]; then + echo -e "Tritonserver already running" + echo -e `ps | grep tritonserver` + exit 1 +fi + +### UNIT TESTS + +TEST_LOG="./metrics_api_test.log" +UNIT_TEST="./metrics_api_test --gtest_output=xml:metrics_api.report.xml" + +rm -fr *.log *.xml + +set +e +export CUDA_VISIBLE_DEVICES=0 +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $UNIT_TEST >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** Metrics API Unit Test Failed\n***" + RET=1 +fi +set -e + +# Prepare a libtorch float32 model with basic config +rm -rf $MODELDIR +model=libtorch_float32_float32_float32 +mkdir -p $MODELDIR/${model}/1 && \ + cp -r $DATADIR/${model}/1/* $MODELDIR/${model}/1/. && \ + cp $DATADIR/${model}/config.pbtxt $MODELDIR/${model}/. && \ + (cd $MODELDIR/${model} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + echo "instance_group [{ kind: KIND_GPU }]" >> config.pbtxt) + +### CPU / RAM metrics tests +set +e +SERVER_LOG="cpu_metrics_test_server.log" +# NOTE: CPU utilization is computed based on the metrics interval, so having +# too small of an interval can skew the results. +SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1000 --log-verbose=1" +run_and_check_server + +CLIENT_PY="./cpu_metrics_test.py" +CLIENT_LOG="cpu_metrics_test_client.log" +python3 -m pytest --junitxml="cpu_metrics.report.xml" ${CLIENT_PY} >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat ${SERVER_LOG} + cat ${CLIENT_LOG} + echo -e "\n***\n*** ${CLIENT_PY} FAILED. \n***" + RET=1 +fi + +kill_server +set -e + +### Pinned memory metrics tests +set +e +CLIENT_PY="./pinned_memory_metrics_test.py" +CLIENT_LOG="pinned_memory_metrics_test_client.log" +SERVER_LOG="pinned_memory_metrics_test_server.log" +SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG} +check_unit_test + +python3 -m pytest --junitxml="pinned_memory_metrics.report.xml" ${CLIENT_PY} >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat ${SERVER_LOG} + cat ${CLIENT_LOG} + echo -e "\n***\n*** ${CLIENT_PY} FAILED. \n***" + RET=1 +fi + +kill_server + +# Custom Pinned memory pool size +export CUSTOM_PINNED_MEMORY_POOL_SIZE=1024 # bytes +SERVER_LOG="custom_pinned_memory_test_server.log" +CLIENT_LOG="custom_pinned_memory_test_client.log" +SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1 --pinned-memory-pool-byte-size=$CUSTOM_PINNED_MEMORY_POOL_SIZE" +run_and_check_server +python3 -m pytest --junitxml="custom_pinned_memory_metrics.report.xml" ${CLIENT_PY} >> ${CLIENT_LOG} 2>&1 +if [ $? -ne 0 ]; then + cat ${SERVER_LOG} + cat ${CLIENT_LOG} + echo -e "\n***\n*** Custom ${CLIENT_PY} FAILED. 
\n***" + RET=1 +fi + +kill_server +set -e + +# Peer access GPU memory utilization Test +# Custom Pinned memory pool size +export CUSTOM_PINNED_MEMORY_POOL_SIZE=0 # bytes +export CUDA_VISIBLE_DEVICES=0 +SERVER_LOG="gpu_peer_memory_test_server.log" +CLIENT_LOG="gpu_peer_memory_test_client.log" + +SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1 --pinned-memory-pool-byte-size=$CUSTOM_PINNED_MEMORY_POOL_SIZE --enable-peer-access=FALSE --cuda-memory-pool-byte-size 0:0 --log-verbose=1" +run_and_check_server +#grep usage stats for triton server from nvidia-smi +memory_size_without_peering=$(nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits | grep $(pgrep tritonserver) | awk '{print $3}') + +#nvidia-smi only lists process which use gpu memory with --enable-peer-access=FALSE nvidia-smi may not list tritonserver +if [ -z $memory_size_without_peering ]; then + memory_size_without_peering=0 +fi + +kill_server + +# Check if memory usage HAS reduced to 0 after using the --enable-peer-access flag +if [ $memory_size_without_peering -ne 0 ]; then + # Print the memory usage for each GPU + echo "Disabling PEERING does not reduce GPU memory usage to ZERO" + echo -e "\n***\n*** GPU Peer enable failed. \n***" + RET=1 +fi + +### GPU Metrics +set +e +export CUDA_VISIBLE_DEVICES=0,1 +SERVER_LOG="./inference_server.log" +CLIENT_LOG="client.log" +run_and_check_server + +num_gpus=`curl -s ${TRITONSERVER_IPADDR}:8002/metrics | grep "nv_gpu_utilization{" | wc -l` +if [ $num_gpus -ne 2 ]; then + echo "Found $num_gpus GPU(s) instead of 2 GPUs being monitored." + echo -e "\n***\n*** GPU metric test failed. \n***" + RET=1 +fi + +kill_server + +export CUDA_VISIBLE_DEVICES=0 +run_and_check_server + +num_gpus=`curl -s ${TRITONSERVER_IPADDR}:8002/metrics | grep "nv_gpu_utilization{" | wc -l` +if [ $num_gpus -ne 1 ]; then + echo "Found $num_gpus GPU(s) instead of 1 GPU being monitored." + echo -e "\n***\n*** GPU metric test failed. \n***" + RET=1 +fi +kill_server + + +# Test metrics interval by querying host and checking energy +METRICS_INTERVAL_MS=500 +# Below time interval is larger than actual metrics interval in case +# the update is not ready for unexpected reason +WAIT_INTERVAL_SECS=0.6 + +SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=${METRICS_INTERVAL_MS}" +run_and_check_server + +num_iterations=10 + +# Add "warm up" iteration because in some cases the GPU metrics collection +# doesn't start immediately +prev_energy=`curl -s ${TRITONSERVER_IPADDR}:8002/metrics | awk '/nv_energy_consumption{/ {print $2}'` +for (( i = 0; i < $num_iterations; ++i )); do + sleep $WAIT_INTERVAL_SECS + current_energy=`curl -s ${TRITONSERVER_IPADDR}:8002/metrics | awk '/nv_energy_consumption{/ {print $2}'` + if [ $current_energy != $prev_energy ]; then + echo -e "\n***\n*** Detected changing metrics, warmup completed.\n***" + break + fi + prev_energy=$current_energy +done + +prev_energy=`curl -s ${TRITONSERVER_IPADDR}:8002/metrics | awk '/nv_energy_consumption{/ {print $2}'` +for (( i = 0; i < $num_iterations; ++i )); do + sleep $WAIT_INTERVAL_SECS + current_energy=`curl -s ${TRITONSERVER_IPADDR}:8002/metrics | awk '/nv_energy_consumption{/ {print $2}'` + if [ $current_energy == $prev_energy ]; then + cat $SERVER_LOG + echo "Metrics were not updated in interval of ${METRICS_INTERVAL_MS} milliseconds" + echo -e "\n***\n*** Metric Interval test failed. 
\n***" + RET=1 + break + fi + prev_energy=$current_energy +done + +kill_server + +### Metric Config CLI and different Metric Types ### +MODELDIR="${PWD}/unit_test_models" +mkdir -p "${MODELDIR}/identity_cache_on/1" +mkdir -p "${MODELDIR}/identity_cache_off/1" +BASE_SERVER_ARGS="--model-repository=${MODELDIR} --model-control-mode=explicit" + +# Check default settings: Counters should be enabled, summaries should be disabled +SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server + +# Enable summaries, counters still enabled by default +SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off --metrics-config summary_latencies=true" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server + +# Enable summaries, disable counters +SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off --metrics-config summary_latencies=true --metrics-config counter_latencies=false" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server + +# Enable summaries and counters, check cache metrics +CACHE_ARGS="--cache-config local,size=1048576" +SERVER_ARGS="${BASE_SERVER_ARGS} ${CACHE_ARGS} --load-model=identity_cache_on --metrics-config summary_latencies=true --metrics-config counter_latencies=true" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +# DLIS-4762: Asserts that request summary is not published when cache is +# enabled for a model, until this if fixed. 
+python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_exist_with_cache 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_exist 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server + +# Check setting custom summary quantiles +export SUMMARY_QUANTILES="0.1:0.0.1,0.7:0.01,0.75:0.01" +SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off --metrics-config summary_latencies=true --metrics-config summary_quantiles=${SUMMARY_QUANTILES}" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_summaries_custom_quantiles 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server + +# Check model namespacing label with namespace on and off +REPOS_DIR="${PWD}/model_namespacing_repos" +mkdir -p "${REPOS_DIR}/addsub_repo/addsub_ensemble/1" +mkdir -p "${REPOS_DIR}/subadd_repo/subadd_ensemble/1" +# Namespace on +SERVER_ARGS="--model-repository=${REPOS_DIR}/addsub_repo --model-repository=${REPOS_DIR}/subadd_repo --model-namespacing=true --allow-metrics=true" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_model_namespacing_label_with_namespace_on 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server +# Namespace off +SERVER_ARGS="--model-repository=${REPOS_DIR}/addsub_repo --model-namespacing=false --allow-metrics=true" +run_and_check_server +python3 ${PYTHON_TEST} MetricsConfigTest.test_model_namespacing_label_with_namespace_off 2>&1 | tee ${CLIENT_LOG} +check_unit_test +kill_server + +### Pending Request Count (Queue Size) Metric Behavioral Tests ### +MODELDIR="${PWD}/queue_size_models" +SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1" +PYTHON_TEST="metrics_queue_size_test.py" +rm -rf "${MODELDIR}" +mkdir -p "${MODELDIR}" + +# Re-use an identity model that sleeps during execution for N seconds for the +# batch of requests. Then we can confirm queue size behaviors for various +# scheduling/batching strategies. +BASE_MODEL="identity_delay" +# Don't use special debug env var for this, just set sufficient parameters for +# each scheduler to let them fill batches when possible. 
+unset TRITONSERVER_DELAY_SCHEDULER +export MAX_BATCH_SIZE=4 +# Delay up to 100ms to form batches up to MAX_BATCH_SIZE +export MAX_QUEUE_DELAY_US=100000 + +# Create a model per scheduler type +DEFAULT_MODEL="${MODELDIR}/default" +cp -r "${BASE_MODEL}" "${DEFAULT_MODEL}" +mkdir -p "${DEFAULT_MODEL}/1" +sed -i "s/^max_batch_size.*/max_batch_size: ${MAX_BATCH_SIZE}/" "${DEFAULT_MODEL}/config.pbtxt" + +DYNAMIC_MODEL="${MODELDIR}/dynamic" +cp -r "${DEFAULT_MODEL}" "${DYNAMIC_MODEL}" +echo -e "\ndynamic_batching { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US} }\n" >> "${DYNAMIC_MODEL}/config.pbtxt" + +MAX_QUEUE_SIZE_MODEL="${MODELDIR}/max_queue_size" +cp -r "${DEFAULT_MODEL}" "${MAX_QUEUE_SIZE_MODEL}" +echo -e "\ndynamic_batching { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US} default_queue_policy { max_queue_size: 4 } }\n" >> "${MAX_QUEUE_SIZE_MODEL}/config.pbtxt" + +SEQUENCE_DIRECT_MODEL="${MODELDIR}/sequence_direct" +cp -r "${DEFAULT_MODEL}" "${SEQUENCE_DIRECT_MODEL}" +echo -e "\nsequence_batching { direct { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US}, minimum_slot_utilization: 1.0 } }\n" >> "${SEQUENCE_DIRECT_MODEL}/config.pbtxt" + +SEQUENCE_OLDEST_MODEL="${MODELDIR}/sequence_oldest" +cp -r "${DEFAULT_MODEL}" "${SEQUENCE_OLDEST_MODEL}" +echo -e "\nsequence_batching { oldest { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US}, max_candidate_sequences: ${MAX_BATCH_SIZE} } }\n" >> "${SEQUENCE_OLDEST_MODEL}/config.pbtxt" + +BASE_ENSEMBLE="ensemble_delay" +ENSEMBLE_MODEL="${MODELDIR}/ensemble" +cp -r "${BASE_ENSEMBLE}" "${ENSEMBLE_MODEL}" +mkdir -p "${ENSEMBLE_MODEL}/1" +# Use uniquely named composing models to avoid clashing +# metric values with individual and ensemble tests. +cp -r "${DEFAULT_MODEL}" "${MODELDIR}/default_composing" +cp -r "${DYNAMIC_MODEL}" "${MODELDIR}/dynamic_composing" + + +run_and_check_server +python3 ${PYTHON_TEST} 2>&1 | tee ${CLIENT_LOG} +kill_server +expected_tests=6 +check_unit_test "${expected_tests}" + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET + diff --git a/qa/L0_metrics/unit_test_models/identity_cache_off/config.pbtxt b/qa/L0_metrics/unit_test_models/identity_cache_off/config.pbtxt new file mode 100644 index 0000000000..863c35df07 --- /dev/null +++ b/qa/L0_metrics/unit_test_models/identity_cache_off/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +response_cache { + enable: false +} diff --git a/qa/L0_metrics/unit_test_models/identity_cache_on/config.pbtxt b/qa/L0_metrics/unit_test_models/identity_cache_on/config.pbtxt new file mode 100644 index 0000000000..4bf5a7ef3b --- /dev/null +++ b/qa/L0_metrics/unit_test_models/identity_cache_on/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +response_cache { + enable: true +} diff --git a/qa/L0_mlflow/plugin_test.py b/qa/L0_mlflow/plugin_test.py new file mode 100755 index 0000000000..a5d87a3c19 --- /dev/null +++ b/qa/L0_mlflow/plugin_test.py @@ -0,0 +1,121 @@ +#!/usr/bin/python + +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json +import unittest + +import numpy as np +import test_util as tu +from mlflow.deployments import get_deploy_client + + +class PluginTest(tu.TestResultCollector): + def setUp(self): + self.client_ = get_deploy_client("triton") + + def _validate_deployment(self, model_name): + # create + self.client_.create_deployment( + model_name, "models:/{}/1".format(model_name), flavor="onnx" + ) + + # list + deployment_list = self.client_.list_deployments() + self.assertEqual(len(deployment_list), 1) + self.assertEqual(deployment_list[0]["name"], model_name) + + # get + deployment = self.client_.get_deployment(model_name) + self.assertEqual(deployment["name"], model_name) + + # predict + inputs = {} + with open("./mlflow-triton-plugin/examples/input.json", "r") as f: + input_json = json.load(f) + for key, value in input_json["inputs"].items(): + inputs[key] = np.array(value, dtype=np.float32) + + output = self.client_.predict(model_name, inputs) + with open("./mlflow-triton-plugin/examples/expected_output.json", "r") as f: + output_json = json.load(f) + for key, value in output_json["outputs"].items(): + np.testing.assert_allclose( + output["outputs"][key], + np.array(value, dtype=np.int32), + err_msg="Inference result is not correct", + ) + + # delete + self.client_.delete_deployment(model_name) + + def test_onnx_flavor(self): + # Log the ONNX model to MLFlow + import mlflow.onnx + import onnx + + model = onnx.load( + "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx" + ) + # Use a different name to ensure the plugin operates on correct model + mlflow.onnx.log_model(model, "triton", registered_model_name="onnx_model") + + self._validate_deployment("onnx_model") + + def test_onnx_flavor_with_files(self): + # Log the ONNX model and additional Triton config file to MLFlow + import mlflow.onnx + import onnx + + model = onnx.load( + "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx" + ) + config_path = ( + "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt" + ) + # Use a different name to ensure the plugin operates on correct model + mlflow.onnx.log_model( + model, "triton", registered_model_name="onnx_model_with_files" + ) + mlflow.log_artifact(config_path, "triton") + + self._validate_deployment("onnx_model_with_files") + 
+ # Check if the additional files are properly copied + import filecmp + + self.assertTrue( + filecmp.cmp(config_path, "./models/onnx_model_with_files/config.pbtxt") + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_mlflow/test.sh b/qa/L0_mlflow/test.sh new file mode 100755 index 0000000000..4b5205ba25 --- /dev/null +++ b/qa/L0_mlflow/test.sh @@ -0,0 +1,276 @@ +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +source ../common/util.sh + +rm -fr *.log *.json + +# The default version of python 3.10.6 included in +# Ubuntu 22.04 installs blinker 1.4. This doesn't +# work with the awscli which we try to install. +# Uninstalling blinker and allowing pip to install blinker 1.6 +# fixes this issue. The alternative to this is to +# install a higher version of python which uses blinker 1.6, +# but it is unknown whether this test should rely on +# the default installation of python. +apt remove -y python3-blinker + +RET=0 + +# Set up MLflow and dependencies used by the test +pip install mlflow onnx onnxruntime boto3 + +# Install AWS CLI +if ! 
command -v aws --version &> /dev/null; then + curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" + unzip awscliv2.zip + ./aws/install + rm -r ./aws/ ./awscliv2.zip +fi + +# Set environment variables for MLFlow and Triton plugin +export MLFLOW_MODEL_REPO=./mlflow/artifacts +export MLFLOW_TRACKING_URI=sqlite:////tmp/mlflow-db.sqlite +export TRITON_URL=localhost:8000 +export TRITON_MODEL_REPO=models +mkdir -p ./mlflow/artifacts + +pip install ./mlflow-triton-plugin/ + +# Clear mlflow registered models if any +python - << EOF +from mlflow.tracking import MlflowClient +c = MlflowClient() +for m in c.search_registered_models(): + c.delete_registered_model(m.name) +EOF + +rm -rf ./models +mkdir -p ./models +# Put some models in model repository to make sure MLFlow plugin would ignore +# model that is not registered via MLFlow +cp -r ./mlflow-triton-plugin/examples/onnx_float32_int32_int32 ./models/existing_model + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=./models --strict-model-config=false --model-control-mode=explicit --load-model=*" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** fail to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Triton flavor with CLI +set +e +CLI_LOG=plugin_cli.log +CLI_RET=0 +python ./mlflow-triton-plugin/scripts/publish_model_to_mlflow.py \ + --model_name onnx_float32_int32_int32 \ + --model_directory ./mlflow-triton-plugin/examples/onnx_float32_int32_int32/ \ + --flavor triton >>$CLI_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Expect 'triton' flavor model is logged to MLFlow\n***" + CLI_RET=1 +fi +if [ $CLI_RET -eq 0 ]; then + mlflow deployments create -t triton --flavor triton \ + --name onnx_float32_int32_int32 -m models:/onnx_float32_int32_int32/1 >>$CLI_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Expect 'triton' flavor model is deployed via MLFlow\n***" + CLI_RET=1 + fi +fi +if [ $CLI_RET -eq 0 ]; then + mlflow deployments list -t triton >>$CLI_LOG 2>&1 + if [ $? -ne 0 ]; then + CLI_RET=1 + fi + if [ `grep -c "onnx_float32_int32_int32.*READY" $CLI_LOG` != "1" ]; then + echo -e "\n***\n*** Expect deployed 'triton' flavor model to be listed\n***" + CLI_RET=1 + fi + if [ `grep -c "existing_model.*READY" $CLI_LOG` != "0" ]; then + echo -e "\n***\n*** Unexpected non-MLflow model listed\n***" + CLI_RET=1 + fi +fi +if [ $CLI_RET -eq 0 ]; then + mlflow deployments get -t triton --name onnx_float32_int32_int32 >>$CLI_LOG 2>&1 + if [ $? -ne 0 ]; then + CLI_RET=1 + fi + if [ `grep -c "^name: onnx_float32_int32_int32" $CLI_LOG` != "1" ]; then + echo -e "\n***\n*** Expect deployed 'triton' flavor model is found\n***" + CLI_RET=1 + fi +fi +if [ $CLI_RET -eq 0 ]; then + mlflow deployments predict -t triton --name onnx_float32_int32_int32 --input-path ./mlflow-triton-plugin/examples/input.json --output-path output.json >>$CLI_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Expect successful 'triton' flavor model prediction\n***" + CLI_RET=1 + fi + python - << EOF +import json +with open("./output.json", "r") as f: + output = json.load(f) +with open("./mlflow-triton-plugin/examples/expected_output.json", "r") as f: + expected_output = json.load(f) +if output == expected_output: + exit(0) +else: + exit(1) +EOF + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Expect 'triton' flavor model prediction matches expected output\n***" + echo -e "Expect:\n" + cat ./mlflow-triton-plugin/examples/expected_output.json + echo -e "\n\nGot:\n" + cat output.json + CLI_RET=1 + fi +fi +if [ $CLI_RET -eq 0 ]; then + mlflow deployments delete -t triton --name onnx_float32_int32_int32 >>$CLI_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Expect successful deletion of 'triton' flavor model\n***" + CLI_RET=1 + fi +fi +if [ $CLI_RET -ne 0 ]; then + cat $CLI_LOG + echo -e "\n***\n*** MLFlow Triton plugin CLI Test FAILED\n***" + RET=1 +fi +set -e + +# ONNX flavor with Python package +set +e +PY_LOG=plugin_py.log +PY_TEST=plugin_test.py +TEST_RESULT_FILE='test_results.txt' +python $PY_TEST >>$PY_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $SERVER_LOG + cat $PY_LOG + echo -e "\n***\n*** Python Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $PY_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill_server + + +# +# Test S3, the setup is duplicated from L0_storage_S3, except the bucket is +# created empty +# + +# Clear mlflow registered models if any +python - << EOF +from mlflow.tracking import MlflowClient +c = MlflowClient() +for m in c.search_registered_models(): + c.delete_registered_model(m.name) +EOF + +# S3 credentials are necessary for this test. Pass via ENV variables +aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + +# S3 bucket path (Point to bucket when testing cloud storage) +BUCKET_URL="s3://triton-bucket-${CI_JOB_ID}" + +# Cleanup and delete S3 test bucket if it already exists (due to test failure) +aws s3 rm $BUCKET_URL --recursive --include "*" && \ + aws s3 rb $BUCKET_URL || true + +# Make S3 test bucket +aws s3 mb "${BUCKET_URL}" + +# Remove Slash in BUCKET_URL +BUCKET_URL=${BUCKET_URL%/} +BUCKET_URL_SLASH="${BUCKET_URL}/" + +export TRITON_MODEL_REPO=${BUCKET_URL} +SERVER_ARGS="--model-repository=${TRITON_MODEL_REPO} --model-control-mode=explicit" +SERVER_LOG="./inference_server.s3.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + # Clean up bucket contents and delete bucket before exiting test + aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + aws s3 rb "${BUCKET_URL}" + exit 1 +fi + +# ONNX flavor with Python package +set +e +PY_LOG=plugin_py.s3.log +PY_TEST=plugin_test.py +TEST_RESULT_FILE='test_results.txt' +python $PY_TEST >>$PY_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $SERVER_LOG + cat $PY_LOG + echo -e "\n***\n*** Python Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? 
-ne 0 ]; then
+        cat $PY_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+set -e
+
+kill_server
+
+# Clean up bucket contents and delete bucket
+aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*"
+aws s3 rb "${BUCKET_URL}"
+
+if [ $RET -eq 0 ]; then
+    echo -e "\n***\n*** Test Passed\n***"
+else
+    echo -e "\n***\n*** Test FAILED\n***"
+fi
+
+exit $RET
diff --git a/src/test/testdata/autofill_sanity/no_version/config.pbtxt b/qa/L0_model_config/autofill_noplatform/common/no_version/config.pbtxt
similarity index 100%
rename from src/test/testdata/autofill_sanity/no_version/config.pbtxt
rename to qa/L0_model_config/autofill_noplatform/common/no_version/config.pbtxt
diff --git a/qa/L0_model_config/autofill_noplatform/common/no_version/expected b/qa/L0_model_config/autofill_noplatform/common/no_version/expected
new file mode 100644
index 0000000000..94e9de9123
--- /dev/null
+++ b/qa/L0_model_config/autofill_noplatform/common/no_version/expected
@@ -0,0 +1 @@
+Invalid model name: Could not determine backend for model 'no_version' with no backend in model configuration. Expected model name of the form 'model.<backend_name>'.
diff --git a/qa/L0_model_config/autofill_noplatform/custom/no_delimiter/config.pbtxt b/qa/L0_model_config/autofill_noplatform/custom/no_delimiter/config.pbtxt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/qa/L0_model_config/autofill_noplatform/custom/no_delimiter/expected b/qa/L0_model_config/autofill_noplatform/custom/no_delimiter/expected
new file mode 100644
index 0000000000..57b8cbdc02
--- /dev/null
+++ b/qa/L0_model_config/autofill_noplatform/custom/no_delimiter/expected
@@ -0,0 +1 @@
+Invalid model name: Could not determine backend for model 'no_delimiter' with no backend in model configuration. Expected model name of the form 'model.<backend_name>'.
diff --git a/qa/L0_model_config/autofill_noplatform/custom/unknown_backend.unknown/config.pbtxt b/qa/L0_model_config/autofill_noplatform/custom/unknown_backend.unknown/config.pbtxt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/qa/L0_model_config/autofill_noplatform/custom/unknown_backend.unknown/expected b/qa/L0_model_config/autofill_noplatform/custom/unknown_backend.unknown/expected
new file mode 100644
index 0000000000..010d38a442
--- /dev/null
+++ b/qa/L0_model_config/autofill_noplatform/custom/unknown_backend.unknown/expected
@@ -0,0 +1 @@
+Invalid argument: unable to find backend library for backend 'unknown', try specifying runtime on the model configuration.
diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/circular_dependency/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/circular_dependency/config.pbtxt new file mode 100644 index 0000000000..75403bc0d8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/circular_dependency/config.pbtxt @@ -0,0 +1,33 @@ +name: "circular_dependency" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "circular_dependency_2" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/circular_dependency_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/circular_dependency_2/config.pbtxt new file mode 100644 index 0000000000..906899dbf5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/circular_dependency_2/config.pbtxt @@ -0,0 +1,33 @@ +name: "circular_dependency_2" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "circular_dependency" + model_version: -1 + input_map { + key: "data" + value: "input" + } + output_map { + key: "prob" + value: "output" + } + } + ] +} +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/expected b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/expected new file mode 100644 index 0000000000..8aac3f2b80 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/expected @@ -0,0 +1 @@ +circular dependency between ensembles: circular_dependency -> ... -> circular_dependency_2 -> circular_dependency \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/expected_2 b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/expected_2 new file mode 100644 index 0000000000..d4e9a04222 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/circular_dependency/expected_2 @@ -0,0 +1 @@ +circular dependency between ensembles: circular_dependency_2 -> ... 
-> circular_dependency -> circular_dependency_2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/ensemble_scheduling_no_set/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/ensemble_scheduling_no_set/config.pbtxt new file mode 100644 index 0000000000..6c5a2dcf0d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/ensemble_scheduling_no_set/config.pbtxt @@ -0,0 +1,18 @@ +name: "ensemble_scheduling_not_set" +max_batch_size: 8 +platform: "ensemble" +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/ensemble_scheduling_no_set/expected b/qa/L0_model_config/autofill_noplatform/ensemble/ensemble_scheduling_no_set/expected new file mode 100644 index 0000000000..aa199bba4a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/ensemble_scheduling_no_set/expected @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble ensemble_scheduling_not_set whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/has_backend/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/has_backend/config.pbtxt new file mode 100644 index 0000000000..e1b11917ba --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/has_backend/config.pbtxt @@ -0,0 +1,19 @@ +name: "has_backend" +max_batch_size: 8 +backend: "onnxruntime" +platform: "ensemble" +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/has_backend/expected b/qa/L0_model_config/autofill_noplatform/ensemble/has_backend/expected new file mode 100644 index 0000000000..2e1e35ceec --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/has_backend/expected @@ -0,0 +1 @@ +Ensemble model 'has_backend' must have platform type 'ensemble' and empty backend type \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/expected b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/expected new file mode 100644 index 0000000000..e8fe6d69cc --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/expected @@ -0,0 +1 @@ +in ensemble inconsistent_data_type, ensemble tensor data: inconsistent data type: TYPE_FP32 is inferred from model inconsistent_data_type while TYPE_INT32 is inferred from model int32_dim1_batch4 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/fp32_dim1_batch2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/fp32_dim1_batch2/config.pbtxt new file mode 100644 index 0000000000..a69fcded8b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/fp32_dim1_batch2/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch2" +max_batch_size: 2 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/inconsistent_data_type/config.pbtxt 
b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/inconsistent_data_type/config.pbtxt new file mode 100644 index 0000000000..246bd03fb0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/inconsistent_data_type/config.pbtxt @@ -0,0 +1,45 @@ +name: "inconsistent_data_type" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "int32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + }, + { + model_name: "fp32_dim1_batch2" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/int32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/int32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..6f07ddbc04 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_data_type/int32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "int32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/expected b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/expected new file mode 100644 index 0000000000..88887f658a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/expected @@ -0,0 +1 @@ +in ensemble inconsistent_shape, ensemble tensor temp_tensor: inconsistent shape: \[-1,16\] is inferred from model fp32_dim1_batch4 while \[-1,16,16,16\] is inferred from model fp32_dim3_batch4 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/expected_2 b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/expected_2 new file mode 100644 index 0000000000..1fb5954303 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/expected_2 @@ -0,0 +1 @@ +in ensemble inconsistent_shape, ensemble tensor temp_tensor: inconsistent shape: \[-1,16,16,16\] is inferred from model fp32_dim3_batch4 while \[-1,16\] is inferred from model fp32_dim1_batch4 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/fp32_dim3_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/fp32_dim3_batch4/config.pbtxt new 
file mode 100644 index 0000000000..8832b63b20 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/fp32_dim3_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim3_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16, 16, 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16, 16, 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/inconsistent_shape/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/inconsistent_shape/config.pbtxt new file mode 100644 index 0000000000..a436668f1f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/inconsistent_shape/inconsistent_shape/config.pbtxt @@ -0,0 +1,45 @@ +name: "inconsistent_shape" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + }, + { + model_name: "fp32_dim3_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/instance_group_set/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/instance_group_set/config.pbtxt new file mode 100644 index 0000000000..0dfb5058d3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/instance_group_set/config.pbtxt @@ -0,0 +1,40 @@ +name: "instance_group_set" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: "model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] +instance_group [ + { + kind: KIND_GPU + gpus: [ 42 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/instance_group_set/expected b/qa/L0_model_config/autofill_noplatform/ensemble/instance_group_set/expected new file mode 100644 index 0000000000..1baf78222c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/instance_group_set/expected @@ -0,0 +1 @@ +instance group should not be specified for ensemble 'instance_group_set' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/expected b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/expected new file mode 100644 index 0000000000..9891d071a9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/expected @@ -0,0 +1 @@ +ensemble invalid_batch_size allows maximum batch size 3, but it contains model fp32_dim1_batch2 which only allows maximum batch size to be 2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch2/config.pbtxt new file mode 100644 index 0000000000..a69fcded8b --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch2/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch2" +max_batch_size: 2 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/invalid_batch_size/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/invalid_batch_size/config.pbtxt new file mode 100644 index 0000000000..89e66b0b72 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_batch_size/invalid_batch_size/config.pbtxt @@ -0,0 +1,45 @@ +name: "invalid_batch_size" +max_batch_size: 3 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + }, + { + model_name: "fp32_dim1_batch2" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/expected b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/expected new file mode 100644 index 0000000000..69e010b9c7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/expected @@ -0,0 +1 @@ +in ensemble invalid_decoupled_branching, step of model 'int32_dim1_nobatch_output2' receives inputs originated from different decoupled models \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/int32_dim1_nobatch_output2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/int32_dim1_nobatch_output2/config.pbtxt new file mode 100644 index 0000000000..2becd8e08d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/int32_dim1_nobatch_output2/config.pbtxt @@ -0,0 +1,32 @@ +name: "int32_dim1_nobatch_output2" +max_batch_size: 0 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/invalid_decoupled_branching/config.pbtxt 
b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/invalid_decoupled_branching/config.pbtxt new file mode 100644 index 0000000000..b02938cd30 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/invalid_decoupled_branching/config.pbtxt @@ -0,0 +1,133 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "invalid_decoupled_branching" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "repeat_1_out" + } + }, + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "repeat_2_out" + } + }, + { + model_name: "int32_dim1_nobatch_output2" + model_version: -1 + input_map { + key: "INPUT0" + value: "repeat_1_out" + } + input_map { + key: "INPUT1" + value: "repeat_2_out" + } + output_map { + key: "OUTPUT0" + value: "identity_0" + } + output_map { + key: "OUTPUT1" + value: "identity_1" + } + }, + { + model_name: "int32_dim1_nobatch_output2" + model_version: -1 + input_map { + key: "INPUT0" + value: "identity_0" + } + input_map { + key: "INPUT1" + value: "identity_1" + } + output_map { + key: "OUTPUT0" + value: "OUT" + } + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/repeat_int32/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/repeat_int32/config.pbtxt new file mode 100644 index 0000000000..ea8955412b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching/repeat_int32/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "repeat_int32" +backend: "repeat" +max_batch_size: 0 +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "IDX" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/expected b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/expected new file mode 100644 index 0000000000..c84c6abe1a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/expected @@ -0,0 +1 @@ +in ensemble invalid_decoupled_branching_2, step of model 'invalid_decoupled_branching_2' receives inputs originated from different decoupled models \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/invalid_decoupled_branching_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/invalid_decoupled_branching_2/config.pbtxt new file mode 100644 index 0000000000..0f448f3a31 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/invalid_decoupled_branching_2/config.pbtxt @@ -0,0 +1,102 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "invalid_decoupled_branching_2" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "OUT" + value: "OUT" + } + }, + { + model_name: "repeat_int32" + model_version: -1 + input_map { + key: "IN" + value: "IN" + } + input_map { + key: "DELAY" + value: "DELAY" + } + input_map { + key: "WAIT" + value: "WAIT" + } + output_map { + key: "IDX" + value: "IDX" + } + } + ] +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "IDX" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/repeat_int32/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/repeat_int32/config.pbtxt new file mode 100644 index 0000000000..ea8955412b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_decoupled_branching_2/repeat_int32/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "repeat_int32" +backend: "repeat" +max_batch_size: 0 +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "DELAY" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "WAIT" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "IDX" + data_type: TYPE_UINT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/expected b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/expected new file mode 100644 index 0000000000..38c7681775 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/expected @@ -0,0 +1 @@ +in ensemble invalid_input_map, ensemble tensor temp_tensor_5 is mapping to non-existing input invalid_input in model fp32_dim1_batch4_input4 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_input4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_input4/config.pbtxt new file mode 100644 index 0000000000..f534f352c9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_input4/config.pbtxt @@ -0,0 +1,52 @@ +name: "fp32_dim1_batch4_input4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_output3/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_output3/config.pbtxt new file mode 100644 index 0000000000..69b18e83e7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_output3/config.pbtxt @@ -0,0 +1,42 @@ +name: "fp32_dim1_batch4_output3" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] 
+ } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/invalid_input_map/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/invalid_input_map/config.pbtxt new file mode 100644 index 0000000000..8bb0896d40 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_input_map/invalid_input_map/config.pbtxt @@ -0,0 +1,101 @@ +name: "invalid_input_map" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor_4" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_5" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_1" + } + output_map { + key: "OUTPUT1" + value: "temp_tensor_2" + } + output_map { + key: "OUTPUT2" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor_1" + } + input_map { + key: "INPUT1" + value: "temp_tensor_2" + } + input_map { + key: "INPUT2" + value: "temp_tensor_3" + } + input_map { + key: "INPUT3" + value: "temp_tensor_4" + } + input_map { + key: "invalid_input" + value: "temp_tensor_5" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/expected b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/expected new file mode 100644 index 0000000000..d9d252e8b6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/expected @@ -0,0 +1 @@ +in ensemble invalid_output_map, ensemble tensor temp_tensor_2 is mapped from non-existing output invalid_output in model fp32_dim1_batch4_output3 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_input4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_input4/config.pbtxt new file mode 100644 index 0000000000..f534f352c9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_input4/config.pbtxt @@ -0,0 +1,52 @@ +name: "fp32_dim1_batch4_input4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: 
"INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_output3/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_output3/config.pbtxt new file mode 100644 index 0000000000..69b18e83e7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_output3/config.pbtxt @@ -0,0 +1,42 @@ +name: "fp32_dim1_batch4_output3" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/invalid_output_map/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/invalid_output_map/config.pbtxt new file mode 100644 index 0000000000..dd57560268 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/invalid_output_map/invalid_output_map/config.pbtxt @@ -0,0 +1,81 @@ +name: "invalid_output_map" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_1" + } + output_map { + key: "invalid_output" + value: "temp_tensor_2" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor_1" + } + input_map { + key: "INPUT1" + value: "temp_tensor_2" + } + input_map { + key: "INPUT2" + value: "temp_tensor_1" + } + input_map { + key: "INPUT3" + value: "temp_tensor_4" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/model_warm_up_set/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/model_warm_up_set/config.pbtxt new file mode 100644 index 0000000000..50891243fe --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/model_warm_up_set/config.pbtxt @@ -0,0 +1,35 @@ +name: "model_warmup_set" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: 
"model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] +model_warmup [{}] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/model_warm_up_set/expected b/qa/L0_model_config/autofill_noplatform/ensemble/model_warm_up_set/expected new file mode 100644 index 0000000000..c2323073a9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/model_warm_up_set/expected @@ -0,0 +1 @@ +model_warmup can not be specified for ensemble 'model_warmup_set' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_input_map/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_input_map/config.pbtxt new file mode 100644 index 0000000000..040c4c5d94 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_input_map/config.pbtxt @@ -0,0 +1,54 @@ +name: "no_input_map" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: "model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "temp_1" + } + }, + { + model_name: "model_b" + model_version: -1 + output_map { + key: "model_b_output" + value: "temp_2" + } + }, + { + model_name: "model_c" + model_version: -1 + input_map { + key: "model_c_input" + value: "temp_2" + } + output_map { + key: "model_c_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_input_map/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_input_map/expected new file mode 100644 index 0000000000..3b1dea3fa2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_input_map/expected @@ -0,0 +1 @@ +must specify 'input_map' in step 1 of ensemble 'no_input_map' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_model_name/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_model_name/config.pbtxt new file mode 100644 index 0000000000..f312eb8ae8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_model_name/config.pbtxt @@ -0,0 +1,56 @@ +name: "no_model_name" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: "model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "temp_1" + } + }, + { + input_map { + key: "model_b_input" + value: "temp_1" + } + output_map { + key: "model_b_output" + value: "temp_2" + } + }, + { + model_name: "model_c" + model_version: -1 + input_map { + key: "model_c_input" + value: "temp_2" + } + output_map { + key: "model_c_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_model_name/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_model_name/expected new file mode 100644 index 0000000000..87aba87093 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform/ensemble/no_model_name/expected @@ -0,0 +1 @@ +must specify 'model_name' in step 1 of ensemble 'no_model_name' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_output_map/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_output_map/config.pbtxt new file mode 100644 index 0000000000..50d00f7576 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_output_map/config.pbtxt @@ -0,0 +1,54 @@ +name: "no_output_map" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: "model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "temp_1" + } + }, + { + model_name: "model_b" + model_version: -1 + input_map { + key: "model_b_input" + value: "temp_1" + } + }, + { + model_name: "model_c" + model_version: -1 + input_map { + key: "model_c_input" + value: "temp_2" + } + output_map { + key: "model_c_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_output_map/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_output_map/expected new file mode 100644 index 0000000000..cbbdb6c0dd --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_output_map/expected @@ -0,0 +1 @@ +must specify 'output_map' in step 1 of ensemble 'no_output_map' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/expected new file mode 100644 index 0000000000..8db4ecdb69 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/expected @@ -0,0 +1 @@ +ensemble 'no_required_version' depends on 'simple' whose required version 2 is not loaded \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/no_required_version/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/no_required_version/config.pbtxt new file mode 100644 index 0000000000..ab1c13f28c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/no_required_version/config.pbtxt @@ -0,0 +1,51 @@ +name: "no_required_version" +platform: "ensemble" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "simple" + model_version: 2 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/simple/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/simple/config.pbtxt new file mode 100644 index 0000000000..855f3d365b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version/simple/config.pbtxt @@ -0,0 +1,33 @@ +name: "simple" +backend: 
"identity" +max_batch_size: 8 +version_policy : { all {} } +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/expected new file mode 100644 index 0000000000..943d792015 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/expected @@ -0,0 +1 @@ +ensemble 'no_required_version_2' depends on 'simple' whose required version 2 is not loaded \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/no_required_version_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/no_required_version_2/config.pbtxt new file mode 100644 index 0000000000..5fd69124b1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/no_required_version_2/config.pbtxt @@ -0,0 +1,71 @@ +name: "no_required_version_2" +platform: "ensemble" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "simple" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "temp0" + } + output_map { + key: "OUTPUT1" + value: "temp1" + } + }, + { + model_name: "simple" + model_version: 2 + input_map { + key: "INPUT0" + value: "temp0" + } + input_map { + key: "INPUT1" + value: "temp1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/simple/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/simple/config.pbtxt new file mode 100644 index 0000000000..855f3d365b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_2/simple/config.pbtxt @@ -0,0 +1,33 @@ +name: "simple" +backend: "identity" +max_batch_size: 8 +version_policy : { all {} } +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/expected new file mode 100644 index 0000000000..3b86973e24 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/expected @@ -0,0 +1 @@ +ensemble 'no_required_version_3' depends on 'simple' whose required version 2 is not loaded \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/no_required_version_3/config.pbtxt 
b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/no_required_version_3/config.pbtxt new file mode 100644 index 0000000000..5f120007d5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/no_required_version_3/config.pbtxt @@ -0,0 +1,71 @@ +name: "no_required_version_3" +platform: "ensemble" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "simple" + model_version: 2 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "temp0" + } + output_map { + key: "OUTPUT1" + value: "temp1" + } + }, + { + model_name: "simple" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp0" + } + input_map { + key: "INPUT1" + value: "temp1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/simple/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/simple/config.pbtxt new file mode 100644 index 0000000000..855f3d365b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_required_version_3/simple/config.pbtxt @@ -0,0 +1,33 @@ +name: "simple" +backend: "identity" +max_batch_size: 8 +version_policy : { all {} } +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_step/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_step/config.pbtxt new file mode 100644 index 0000000000..d4aa386243 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_step/config.pbtxt @@ -0,0 +1,22 @@ +name: "no_step" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_step/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_step/expected new file mode 100644 index 0000000000..bd0d9afc31 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_step/expected @@ -0,0 +1 @@ +must specify 'step' for ensemble 'no_step' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/no_step_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/no_step_2/config.pbtxt new file mode 100644 index 0000000000..aa32212f9a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_step_2/config.pbtxt @@ -0,0 +1,20 @@ +name: "no_step_2" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git 
a/qa/L0_model_config/autofill_noplatform/ensemble/no_step_2/expected b/qa/L0_model_config/autofill_noplatform/ensemble/no_step_2/expected new file mode 100644 index 0000000000..a704e750b3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/no_step_2/expected @@ -0,0 +1 @@ +must specify 'step' for ensemble 'no_step_2' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/expected b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/expected new file mode 100644 index 0000000000..09561377d9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/expected @@ -0,0 +1 @@ +ensemble non_existing_model contains models that are not available or ambiguous: fp32_dim1_batch4_input4 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4_output3/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4_output3/config.pbtxt new file mode 100644 index 0000000000..69b18e83e7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4_output3/config.pbtxt @@ -0,0 +1,42 @@ +name: "fp32_dim1_batch4_output3" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/non_existing_model/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/non_existing_model/config.pbtxt new file mode 100644 index 0000000000..69d9a3fbbe --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/non_existing_model/non_existing_model/config.pbtxt @@ -0,0 +1,85 @@ +name: "non_existing_model" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_1" + } + output_map { + key: "OUTPUT1" + value: "temp_tensor_2" + } + output_map { + key: "OUTPUT2" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + 
model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/optimization_set/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/optimization_set/config.pbtxt new file mode 100644 index 0000000000..28a20e5fce --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/optimization_set/config.pbtxt @@ -0,0 +1,37 @@ +name: "optimization_set" +max_batch_size: 8 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: "model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] +optimization { + priority: PRIORITY_MAX +} diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/optimization_set/expected b/qa/L0_model_config/autofill_noplatform/ensemble/optimization_set/expected new file mode 100644 index 0000000000..c301936862 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/optimization_set/expected @@ -0,0 +1 @@ +optimization should not be specified for ensemble 'optimization_set' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/output_to_tensor_overmapped/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/output_to_tensor_overmapped/config.pbtxt new file mode 100644 index 0000000000..b312e545a5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/output_to_tensor_overmapped/config.pbtxt @@ -0,0 +1,89 @@ +name: "output_to_tensor_overmapped" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_1" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output1" + value: "temp_tensor_2" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/output_to_tensor_overmapped/expected b/qa/L0_model_config/autofill_noplatform/ensemble/output_to_tensor_overmapped/expected new file mode 
100644 index 0000000000..10d49e9683 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/output_to_tensor_overmapped/expected @@ -0,0 +1 @@ +ensemble tensor 'temp_tensor_2' can appear in an output map only once for ensemble 'output_to_tensor_overmapped' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/config.pbtxt new file mode 100644 index 0000000000..7f48b18da0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/config.pbtxt @@ -0,0 +1,89 @@ +name: "redundant_tensor_as_input" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "temp_tensor_5" + } + output_map { + key: "output" + value: "temp_tensor_6" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "temp_tensor_4" + } + output_map { + key: "output1" + value: "temp_tensor_1" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/expected b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/expected new file mode 100644 index 0000000000..ddef5813cb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/expected @@ -0,0 +1 @@ +ensemble tensor 'temp_tensor_6' is unused in ensemble 'redundant_tensor_as_input' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/expected_2 b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/expected_2 new file mode 100644 index 0000000000..7137095510 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_input/expected_2 @@ -0,0 +1 @@ +ensemble tensor 'temp_tensor_5' is unused in ensemble 'redundant_tensor_as_input' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_output/config.pbtxt new file mode 100644 index 0000000000..841218efad --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_output/config.pbtxt @@ -0,0 +1,58 @@ +name: "redundant_tensor_as_output" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_1" + } + }, + { + model_name: "fp32_dim1_batch4_output3" 
+ model_version: -1 + input_map { + key: "input" + value: "temp_tensor_1" + } + output_map { + key: "output1" + value: "prob" + } + output_map { + key: "output2" + value: "prob_2" + } + output_map { + key: "output3" + value: "temp_tensor_2" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "prob_2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_output/expected b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_output/expected new file mode 100644 index 0000000000..8e43657c42 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/redundant_tensor_as_output/expected @@ -0,0 +1 @@ +ensemble tensor 'temp_tensor_2' is unused in ensemble 'redundant_tensor_as_output' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/expected b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/expected new file mode 100644 index 0000000000..c8eda96aa3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/expected @@ -0,0 +1 @@ +circular dependency between ensembles: self_circular_dependency -> ... -> self_circular_dependency -> self_circular_dependency \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_input4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_input4/config.pbtxt new file mode 100644 index 0000000000..f534f352c9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_input4/config.pbtxt @@ -0,0 +1,52 @@ +name: "fp32_dim1_batch4_input4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_output3/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_output3/config.pbtxt new file mode 100644 index 0000000000..69b18e83e7 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_output3/config.pbtxt @@ -0,0 +1,42 @@ +name: "fp32_dim1_batch4_output3" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/self_circular_dependency/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/self_circular_dependency/config.pbtxt new file mode 100644 index 0000000000..6441f2b6c3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/self_circular_dependency/self_circular_dependency/config.pbtxt @@ -0,0 +1,97 @@ +name: "self_circular_dependency" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_1" + } + output_map { + key: "OUTPUT1" + value: "temp_tensor_2" + } + output_map { + key: "OUTPUT2" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor_1" + } + input_map { + key: "INPUT1" + value: "temp_tensor_2" + } + input_map { + key: "INPUT2" + value: "temp_tensor_3" + } + input_map { + key: "INPUT3" + value: "temp_tensor_4" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_5" + } + }, + { + model_name: "self_circular_dependency" + model_version: -1 + input_map { + key: "data" + value: "temp_tensor_5" + } + output_map { + key: "prob" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/tensor_to_input_overmapped/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/tensor_to_input_overmapped/config.pbtxt new file mode 100644 index 0000000000..0ea19f9d54 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/tensor_to_input_overmapped/config.pbtxt @@ -0,0 +1,93 @@ +name: "tensor_to_input_overmapped" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "temp_tensor_5" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output1" + value: "temp_tensor_1" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + 
key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_5" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/tensor_to_input_overmapped/expected b/qa/L0_model_config/autofill_noplatform/ensemble/tensor_to_input_overmapped/expected new file mode 100644 index 0000000000..da1e39d5e9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/tensor_to_input_overmapped/expected @@ -0,0 +1 @@ +ensemble tensor 'temp_tensor_5' is unused in ensemble 'tensor_to_input_overmapped' diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/expected b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/expected new file mode 100644 index 0000000000..910cda2481 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/expected @@ -0,0 +1 @@ +in ensemble unmapped_input, input INPUT0 in model fp32_dim1_batch4_input4 is not mapped to any ensemble tensors \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_input4/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_input4/config.pbtxt new file mode 100644 index 0000000000..f534f352c9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_input4/config.pbtxt @@ -0,0 +1,52 @@ +name: "fp32_dim1_batch4_input4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT3" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_output3/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_output3/config.pbtxt new file mode 100644 index 0000000000..69b18e83e7 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_output3/config.pbtxt @@ -0,0 +1,42 @@ +name: "fp32_dim1_batch4_output3" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/unmapped_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/unmapped_input/config.pbtxt new file mode 100644 index 0000000000..8667da98cc --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unmapped_input/unmapped_input/config.pbtxt @@ -0,0 +1,69 @@ +name: "unmapped_input" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor_2" + } + output_map { + key: "OUTPUT1" + value: "temp_tensor_3" + } + output_map { + key: "OUTPUT2" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "INPUT1" + value: "temp_tensor_2" + } + input_map { + key: "INPUT2" + value: "temp_tensor_3" + } + input_map { + key: "INPUT3" + value: "temp_tensor_4" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_input/config.pbtxt new file mode 100644 index 0000000000..3e490278ae --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_input/config.pbtxt @@ -0,0 +1,82 @@ +name: "unreachable_input" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output1" + value: "temp_tensor_1" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "data_2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_input/expected 
b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_input/expected new file mode 100644 index 0000000000..891c3e3324 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_input/expected @@ -0,0 +1 @@ +ensemble input 'data_2' for ensemble unreachable_input' is not used \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output/config.pbtxt new file mode 100644 index 0000000000..e0367a66cf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output/config.pbtxt @@ -0,0 +1,82 @@ +name: "unreachable_output" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output1" + value: "temp_tensor_1" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "prob_2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output/expected b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output/expected new file mode 100644 index 0000000000..9d193b26d0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output/expected @@ -0,0 +1 @@ +ensemble output 'prob_2' for ensemble unreachable_output' is not used \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_2/config.pbtxt new file mode 100644 index 0000000000..0d40bf1b45 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_2/config.pbtxt @@ -0,0 +1,94 @@ +name: "unreachable_output_2" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "prob_2" + } + output_map { + key: "output" + value: "prob_2" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output1" + value: "temp_tensor_1" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + 
value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "prob_2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_2/expected b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_2/expected new file mode 100644 index 0000000000..0363277f57 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_2/expected @@ -0,0 +1 @@ +output 'prob_2' for ensemble 'unreachable_output_2' is not written \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_3/config.pbtxt b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_3/config.pbtxt new file mode 100644 index 0000000000..61e5eee972 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_3/config.pbtxt @@ -0,0 +1,94 @@ +name: "unreachable_output_3" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output" + value: "temp_tensor_4" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "input" + value: "not_written_tensor" + } + output_map { + key: "output" + value: "prob_2" + } + }, + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "input" + value: "data" + } + output_map { + key: "output1" + value: "temp_tensor_1" + } + output_map { + key: "output2" + value: "temp_tensor_2" + } + output_map { + key: "output3" + value: "temp_tensor_3" + } + }, + { + model_name: "fp32_dim1_batch4_input4" + model_version: -1 + input_map { + key: "input1" + value: "temp_tensor_1" + } + input_map { + key: "input2" + value: "temp_tensor_2" + } + input_map { + key: "input3" + value: "temp_tensor_3" + } + input_map { + key: "input4" + value: "temp_tensor_4" + } + output_map { + key: "output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "prob_2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_3/expected b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_3/expected new file mode 100644 index 0000000000..f7add40dda --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/ensemble/unreachable_output_3/expected @@ -0,0 +1 @@ +output 'prob_2' for ensemble 'unreachable_output_3' is not written: at least one of its depending tensors, 'not_written_tensor', is not connected \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! 
+CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_dims/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/expected b/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/expected new file mode 100644 index 0000000000..52d579417e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_input_dims/expected @@ -0,0 +1 @@ +model 'bad_input_dims', tensor 'INPUT0': the model expects 2 dimensions (shape \[-1,16\]) but the model configuration specifies 3 dimensions (an initial batch dimension because max_batch_size > 0 followed by the explicit tensor shape, making complete shape \[-1,16,1\]) \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/1/model.onnx new file mode 100644 index 0000000000..c9f6a92bc7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/1/model.onnx @@ -0,0 +1,33 @@ +triton:¸ + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! +CAST1OUTPUT1"Cast* +to onnx_nobatch_int32_int8_int8Z +INPUT0 + + +Z +INPUT1 + + +b +OUTPUT0 + + +b +OUTPUT1 + + +B \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/config.pbtxt new file mode 100644 index 0000000000..7d4be73dbb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/config.pbtxt @@ -0,0 +1,13 @@ +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/expected b/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/expected new file mode 100644 index 0000000000..07ebf4b459 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_max_batch_size/expected @@ -0,0 +1 @@ +autofill failed for model 'bad_max_batch_size': model does not support batching while non-zero max_batch_size is specified \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! 
+CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_dims/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/expected b/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/expected new file mode 100644 index 0000000000..5a11d49e68 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/bad_output_dims/expected @@ -0,0 +1 @@ +model 'bad_output_dims', tensor 'OUTPUT1': the model expects 2 dimensions (shape \[-1,16\]) but the model configuration specifies 2 dimensions (an initial batch dimension because max_batch_size > 0 followed by the explicit tensor shape, making complete shape \[-1,1\]) diff --git a/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! +CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_few_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_few_inputs/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/expected b/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/expected new file mode 100644 index 0000000000..f6639e85ae --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/too_few_inputs/expected @@ -0,0 +1 @@ +unable to load model 'too_few_inputs', configuration expects 1 inputs, model provides 2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! 
+CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_many_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_many_inputs/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/expected b/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/expected new file mode 100644 index 0000000000..e88e97dcfb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/too_many_inputs/expected @@ -0,0 +1 @@ +unable to load model 'too_many_inputs', configuration expects 3 inputs, model provides 2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! +CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_input/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/onnx/unknown_input/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/expected b/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/expected new file mode 100644 index 0000000000..e2a2abbf09 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/unknown_input/expected @@ -0,0 +1 @@ +unexpected inference input 'INPUT_UNKNOWN', allowed inputs are: INPUT0, INPUT1 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/1/model.onnx b/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! 
+CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_output/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/onnx/unknown_output/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/expected b/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/expected new file mode 100644 index 0000000000..38fd5e2785 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/onnx/unknown_output/expected @@ -0,0 +1 @@ +unexpected inference output 'OUTPUT_UNKNOWN', allowed outputs are: OUTPUT0, OUTPUT1 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/openvino/bad_input_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/openvino/bad_input_dims/config.pbtxt new file mode 100644 index 0000000000..87f49cf11a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/bad_input_dims/config.pbtxt @@ -0,0 +1,12 @@ +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 256 +} diff --git a/qa/L0_model_config/autofill_noplatform/openvino/bad_input_dims/expected b/qa/L0_model_config/autofill_noplatform/openvino/bad_input_dims/expected new file mode 100644 index 0000000000..bd6051f9d5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/bad_input_dims/expected @@ -0,0 +1 @@ +model 'bad_input_dims', tensor 'input1': the model expects 2 dimensions (shape \[1,4\]) but the model configuration specifies 2 dimensions (shape \[1,256\]) \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/openvino/bad_output_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/openvino/bad_output_dims/config.pbtxt new file mode 100644 index 0000000000..b177c07d18 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/bad_output_dims/config.pbtxt @@ -0,0 +1,12 @@ +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 128 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} diff --git a/qa/L0_model_config/autofill_noplatform/openvino/bad_output_dims/expected b/qa/L0_model_config/autofill_noplatform/openvino/bad_output_dims/expected new file mode 100644 index 0000000000..2f0e5be8e2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/bad_output_dims/expected @@ -0,0 +1 @@ +model 'bad_output_dims', tensor 'Func/PartitionedCall/output/_2:0': the model expects 2 dimensions (shape \[1,4\]) but the model configuration specifies 2 dimensions (shape \[1,128\]) \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/openvino/too_few_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/openvino/too_few_inputs/config.pbtxt new file mode 100644 index 0000000000..be95f0b18a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/too_few_inputs/config.pbtxt @@ -0,0 +1,6 @@ +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} diff --git a/qa/L0_model_config/autofill_noplatform/openvino/too_few_inputs/expected 
b/qa/L0_model_config/autofill_noplatform/openvino/too_few_inputs/expected new file mode 100644 index 0000000000..f6639e85ae --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/too_few_inputs/expected @@ -0,0 +1 @@ +unable to load model 'too_few_inputs', configuration expects 1 inputs, model provides 2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/openvino/too_many_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/openvino/too_many_inputs/config.pbtxt new file mode 100644 index 0000000000..283f498b33 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/too_many_inputs/config.pbtxt @@ -0,0 +1,18 @@ +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input_extra" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} diff --git a/qa/L0_model_config/autofill_noplatform/openvino/too_many_inputs/expected b/qa/L0_model_config/autofill_noplatform/openvino/too_many_inputs/expected new file mode 100644 index 0000000000..e88e97dcfb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/too_many_inputs/expected @@ -0,0 +1 @@ +unable to load model 'too_many_inputs', configuration expects 3 inputs, model provides 2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/openvino/unknown_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/openvino/unknown_input/config.pbtxt new file mode 100644 index 0000000000..ed519869f3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/unknown_input/config.pbtxt @@ -0,0 +1,24 @@ +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "unknown_input" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} diff --git a/qa/L0_model_config/autofill_noplatform/openvino/unknown_input/expected b/qa/L0_model_config/autofill_noplatform/openvino/unknown_input/expected new file mode 100644 index 0000000000..e540422197 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/unknown_input/expected @@ -0,0 +1 @@ +unexpected inference input 'unknown_input', allowed inputs are: Func/PartitionedCall/input/_0:0, input1 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/openvino/unknown_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform/openvino/unknown_output/config.pbtxt new file mode 100644 index 0000000000..202ec57eca --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/unknown_output/config.pbtxt @@ -0,0 +1,18 @@ +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "unknown_output" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} diff --git a/qa/L0_model_config/autofill_noplatform/openvino/unknown_output/expected b/qa/L0_model_config/autofill_noplatform/openvino/unknown_output/expected new file mode 100644 index 0000000000..b374338374 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/openvino/unknown_output/expected @@ -0,0 +1 @@ +unexpected inference output 'unknown_output', allowed outputs are: Func/PartitionedCall/output/_2:0, Func/PartitionedCall/output/_3:0 \ No 
newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/config.pbtxt new file mode 100644 index 0000000000..e4d60bbc87 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/config.pbtxt @@ -0,0 +1,27 @@ +name: "conflicting_max_batch_size" +max_batch_size: 6 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/expected b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/expected new file mode 100644 index 0000000000..a83cc3cb6b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/expected @@ -0,0 +1 @@ +configuration specified max_batch_size 6, but in auto-complete-config function for model 'conflicting_max_batch_size' specified max_batch_size 4 diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py new file mode 100644 index 0000000000..17da02915b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/config.pbtxt new file mode 100644 index 0000000000..894908fa29 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/config.pbtxt @@ -0,0 +1,28 @@ +name: "conflicting_scheduler_sequence" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +sequence_batching: { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/expected b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/expected new file mode 100644 index 0000000000..cd931ba1d6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/expected @@ -0,0 +1 @@ +Configuration specified scheduling_choice as 'sequence_batching', but auto-complete-config function for model 'conflicting_scheduler_sequence' tries to set scheduling_choice as 'dynamic_batching' \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py new file mode 100644 index 0000000000..b1399382c4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/input_mismatch_datatype/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_datatype/config.pbtxt new file mode 100644 index 0000000000..a375af8d3c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_datatype/config.pbtxt @@ -0,0 +1,26 @@ +name: "input_mismatch_datatype" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/input_mismatch_datatype/expected b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_datatype/expected new file mode 100644 index 0000000000..881f83ab7e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_datatype/expected @@ -0,0 +1 @@ +unable to load model 'input_mismatch_datatype', configuration expects datatype TYPE_INT32 for input 'INPUT1', model provides TYPE_FP32 diff --git a/qa/L0_model_config/autofill_noplatform/python/input_mismatch_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_dims/config.pbtxt new file mode 100644 index 0000000000..da27669c2a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_dims/config.pbtxt @@ -0,0 +1,26 @@ +name: "input_mismatch_dims" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/input_mismatch_dims/expected b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_dims/expected new file mode 100644 index 0000000000..f572ceb26d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_mismatch_dims/expected @@ -0,0 +1 @@ +model 'input_mismatch_dims', tensor 'INPUT1': the model expects dims \[4\] but the model configuration specifies dims \[16\] diff --git 
a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/config.pbtxt new file mode 100644 index 0000000000..220b9921cf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/config.pbtxt @@ -0,0 +1,26 @@ +name: "input_missing_datatype" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/expected b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/expected new file mode 100644 index 0000000000..23a402bdc0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/expected @@ -0,0 +1 @@ +input 'INPUT0' in auto-complete-config function for model 'input_missing_datatype' is missing 'data_type' property. diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py new file mode 100644 index 0000000000..cfd6aab9d6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/config.pbtxt new file mode 100644 index 0000000000..8f5e7bfd01 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/config.pbtxt @@ -0,0 +1,26 @@ +name: "input_missing_dims" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/expected b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/expected new file mode 100644 index 0000000000..02a1d955db --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/expected @@ -0,0 +1 @@ +input 'INPUT1' in auto-complete-config function for model 'input_missing_dims' is missing 'dims' property. diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py new file mode 100644 index 0000000000..8c02b4ce40 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32"} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/config.pbtxt new file mode 100644 index 0000000000..66f105f834 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/config.pbtxt @@ -0,0 +1,26 @@ +name: "input_missing_name" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/expected b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/expected new file mode 100644 index 0000000000..c3b05a0fc3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/expected @@ -0,0 +1 @@ +input in auto-complete-config function for model 'input_missing_name' is missing 'name' property. diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py new file mode 100644 index 0000000000..33a76b6b30 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/config.pbtxt new file mode 100644 index 0000000000..348928a31b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/config.pbtxt @@ -0,0 +1,26 @@ +name: "input_wrong_property" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/expected b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/expected new file mode 100644 index 0000000000..c91f4599ee --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/expected @@ -0,0 +1 @@ +input 'INPUT1' in auto-complete-config function for model 'input_wrong_property' contains property other than 'name', 'data_type', 'dims' and 'optional'. diff --git a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py new file mode 100644 index 0000000000..f3e883db06 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py @@ -0,0 +1,50 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = { + "name": "INPUT1", + "data_type": "TYPE_FP32", + "dims": [4], + "is_shape_tensor:": True, + } + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/config.pbtxt new file mode 100644 index 0000000000..3100235010 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/expected b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/expected new file mode 100644 index 0000000000..388c6a728d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/expected @@ -0,0 +1 @@ +model transaction property in auto-complete-config function for model 'model_transaction_policy_invalid_args' contains property other than 'decoupled' diff --git a/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/model.py b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/model.py new file mode 100644 index 0000000000..4de9d7c80a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_invalid_args/model.py @@ -0,0 +1,47 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + transaction_policy = {"invalid": "argument"} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_model_transaction_policy(transaction_policy) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/config.pbtxt new file mode 100644 index 0000000000..f8113f307e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/config.pbtxt @@ -0,0 +1,28 @@ +model_transaction_policy { + decoupled: false +} + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/expected b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/expected new file mode 100644 index 0000000000..bbdc5d2165 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/expected @@ -0,0 +1 @@ +trying to change decoupled property in auto-complete-config for model 'model_transaction_policy_mismatch', which is already set to 'False' diff --git 
a/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/model.py b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/model.py new file mode 100644 index 0000000000..424eca60ce --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/model_transaction_policy_mismatch/model.py @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_model_transaction_policy(dict(decoupled=True)) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/no_return/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/no_return/config.pbtxt new file mode 100644 index 0000000000..3f8526855f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/no_return/config.pbtxt @@ -0,0 +1,26 @@ +name: "no_return" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/no_return/expected b/qa/L0_model_config/autofill_noplatform/python/no_return/expected new file mode 100644 index 0000000000..17fcb4737c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/no_return/expected @@ -0,0 +1 @@ +auto_complete_config function in model 'no_return' must return a valid pb.ModelConfig object. diff --git a/qa/L0_model_config/autofill_noplatform/python/no_return/model.py b/qa/L0_model_config/autofill_noplatform/python/no_return/model.py new file mode 100644 index 0000000000..65fae1dcc2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/no_return/model.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/output_mismatch_datatype/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_datatype/config.pbtxt new file mode 100644 index 0000000000..5b3f5f24e3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_datatype/config.pbtxt @@ -0,0 +1,26 @@ +name: "output_mismatch_datatype" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_mismatch_datatype/expected b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_datatype/expected new file mode 100644 index 0000000000..ad150a320d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_datatype/expected @@ -0,0 +1 @@ +unable to load model 'output_mismatch_datatype', configuration expects datatype TYPE_INT32 for output 'OUTPUT0', model provides TYPE_FP32 diff --git a/qa/L0_model_config/autofill_noplatform/python/output_mismatch_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_dims/config.pbtxt new file mode 100644 index 0000000000..ec617ed76a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_dims/config.pbtxt @@ -0,0 +1,26 @@ +name: "output_mismatch_dims" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_mismatch_dims/expected b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_dims/expected new file mode 100644 index 0000000000..173ee0582a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_mismatch_dims/expected @@ -0,0 +1 @@ +model 'output_mismatch_dims', tensor 'OUTPUT1': the model expects dims \[4\] but the model configuration specifies dims \[16\] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/config.pbtxt 
b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/config.pbtxt new file mode 100644 index 0000000000..dff1f7f754 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/config.pbtxt @@ -0,0 +1,26 @@ +name: "output_missing_datatype" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/expected b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/expected new file mode 100644 index 0000000000..31e748a285 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/expected @@ -0,0 +1 @@ +output 'OUTPUT0' in auto-complete-config function for model 'output_missing_datatype' is missing 'data_type' property. diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py new file mode 100644 index 0000000000..26ef3e5c7e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/config.pbtxt new file mode 100644 index 0000000000..47c652164b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/config.pbtxt @@ -0,0 +1,26 @@ +name: "output_missing_dims" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/expected b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/expected new file mode 100644 index 0000000000..3c24750edb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/expected @@ -0,0 +1 @@ +output 'OUTPUT1' in auto-complete-config function for model 'output_missing_dims' is missing 'dims' property. diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py new file mode 100644 index 0000000000..6e43928239 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32"} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/config.pbtxt new file mode 100644 index 0000000000..17bed797d9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/config.pbtxt @@ -0,0 +1,26 @@ +name: "output_missing_name" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/expected b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/expected new file mode 100644 index 0000000000..7ad0a3a1d2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/expected @@ -0,0 +1 @@ +output in auto-complete-config function for model 'output_missing_name' is missing 'name' property. diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py new file mode 100644 index 0000000000..cde57b7827 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/config.pbtxt b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/config.pbtxt new file mode 100644 index 0000000000..21d4d12c32 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/config.pbtxt @@ -0,0 +1,26 @@ +name: "output_wrong_property" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/expected b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/expected new file mode 100644 index 0000000000..34a6a728e8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/expected @@ -0,0 +1 @@ +output 'OUTPUT1' in auto-complete-config function for model 'output_wrong_property' contains property other than 'name', 'data_type' and 'dims'. diff --git a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py new file mode 100644 index 0000000000..4dd17ea4e3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py @@ -0,0 +1,50 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = { + "name": "OUTPUT1", + "data_type": "TYPE_FP32", + "dims": [4], + "is_shape_tensor:": True, + } + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform/pytorch/too_few_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_inputs/config.pbtxt new file mode 100644 index 0000000000..df6ea8b3de --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_inputs/config.pbtxt @@ -0,0 +1,14 @@ +max_batch_size: 1 +output [ + { + name: "OUTPUT__0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT__1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] +backend: "pytorch" diff --git a/qa/L0_model_config/autofill_noplatform/pytorch/too_few_inputs/expected b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_inputs/expected new file mode 100644 index 0000000000..7a89bd65b4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_inputs/expected @@ -0,0 +1 @@ +unable to load model 'too_few_inputs', configuration expects 0 inputs, model provides 2 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/pytorch/too_few_outputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_outputs/config.pbtxt new file mode 100644 index 0000000000..8b93e58410 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_outputs/config.pbtxt @@ -0,0 +1,14 @@ +max_batch_size: 1 +input [ + { + name: "INPUT__0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT__1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +backend: "pytorch" diff --git a/qa/L0_model_config/autofill_noplatform/pytorch/too_few_outputs/expected b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_outputs/expected new file mode 100644 index 0000000000..872e19b2d1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/pytorch/too_few_outputs/expected @@ -0,0 +1 @@ +model configuration must contain at least one output, none were 
specified \ No newline at end of file diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_dims/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_dims/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/config.pbtxt new file mode 100644 index 0000000000..b393fb4e00 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16, 1 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/expected new file mode 100644 index 0000000000..9db37f7864 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_dims/expected @@ -0,0 +1 @@ +Internal: unable to autofill for 'bad_input_dims', model tensor configurations are contradicting each other in terms of whether batching is supported \ No newline at end of file diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_type/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_type/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/1/model.savedmodel/saved_model.pb diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_type/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_input_type/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/expected new file mode 100644 index 0000000000..584634b2eb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_input_type/expected @@ -0,0 +1 @@ +Invalid argument: unable to load model 'bad_input_type', configuration expects datatype TYPE_FP32 for input 'INPUT1', model provides TYPE_INT32 \ No newline at end of file diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_dims/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/1/model.savedmodel/saved_model.pb similarity index 100% rename from 
src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_dims/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/config.pbtxt new file mode 100644 index 0000000000..004ed9a54f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/expected new file mode 100644 index 0000000000..70a0138e77 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_dims/expected @@ -0,0 +1 @@ +Invalid argument: model 'bad_output_dims', tensor 'OUTPUT1': the model expects 2 dimensions (shape \[-1,16\]) but the model configuration specifies 2 dimensions (an initial batch dimension because max_batch_size > 0 followed by the explicit tensor shape, making complete shape \[-1,1\]) \ No newline at end of file diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_type/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_type/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/1/model.savedmodel/saved_model.pb diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_type/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/bad_output_type/config.pbtxt rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/expected new file mode 100644 index 0000000000..bbbe1846d1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/bad_output_type/expected @@ -0,0 +1 @@ +Invalid argument: unable to load model 'bad_output_type', configuration expects datatype TYPE_INT16 for output 'OUTPUT0', model provides TYPE_INT8 \ No newline at end of file diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/empty_config/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/empty_config/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/1/model.savedmodel/saved_model.pb diff --git 
a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/config.pbtxt new file mode 100644 index 0000000000..cee3e28b89 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/config.pbtxt @@ -0,0 +1,30 @@ +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_many_inputs/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/expected similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_many_inputs/expected rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/too_many_inputs/expected diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_config/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_config/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/config.pbtxt new file mode 100644 index 0000000000..0df318caa8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT_UNKNOWN" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_input/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/expected similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_input/expected rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_input/expected diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_name_platform/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_name_platform/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/config.pbtxt new file mode 100644 index 0000000000..979b05c4ee --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/config.pbtxt @@ -0,0 
+1,20 @@ +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT_UNKNOWN" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_output/expected b/qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/expected similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_output/expected rename to qa/L0_model_config/autofill_noplatform/tensorflow_savedmodel/unknown_output/expected diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_max/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_max/config.pbtxt new file mode 100644 index 0000000000..939680951a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_max/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 33 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 33 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 33 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 33 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_max/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_max/expected new file mode 100644 index 0000000000..33630c195b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_max/expected @@ -0,0 +1 @@ +model configuration specified invalid shape for input 'INPUT0' for model bad_dynamic_shapes_max. Error details: model expected the shape of dimension 1 to be between 4 and 32 but received 33 diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_min/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_min/config.pbtxt new file mode 100644 index 0000000000..06b86f1053 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_min/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 3 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 3 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 3 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 3 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_min/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_min/expected new file mode 100644 index 0000000000..288d129df0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_dynamic_shapes_min/expected @@ -0,0 +1 @@ +model configuration specified invalid shape for input 'INPUT0' for model bad_dynamic_shapes_min. 
Error details: model expected the shape of dimension 1 to be between 4 and 32 but received 3 diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_dims/config.pbtxt new file mode 100644 index 0000000000..8ab5d5cc51 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_dims/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 7 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_dims/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_dims/expected new file mode 100644 index 0000000000..7cbf8a25ac --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_dims/expected @@ -0,0 +1 @@ +model 'bad_input_dims', tensor 'INPUT1': the model expects 2 dimensions (shape \[-1,16\]) but the model configuration specifies 2 dimensions (an initial batch dimension because max_batch_size > 0 followed by the explicit tensor shape, making complete shape \[-1,7\]) \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_non_linear_format_io/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_non_linear_format_io/config.pbtxt new file mode 100644 index 0000000000..535def647e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_non_linear_format_io/config.pbtxt @@ -0,0 +1,26 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + is_non_linear_format_io: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_non_linear_format_io/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_non_linear_format_io/expected new file mode 100644 index 0000000000..548c1a70e5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_non_linear_format_io/expected @@ -0,0 +1 @@ +'INPUT0' uses a linear IO format, but 'is_non_linear_format_io' is incorrectly set to true in the model configuration. 
diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape/config.pbtxt new file mode 100644 index 0000000000..2425ee99a7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape/config.pbtxt @@ -0,0 +1,26 @@ + +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16, 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape/expected new file mode 100644 index 0000000000..0bee80c5f4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape/expected @@ -0,0 +1 @@ +unable to autofill for 'bad_input_shape', model tensor configurations are contradicting each other in terms of whether batching is supported \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape_tensor/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape_tensor/config.pbtxt new file mode 100644 index 0000000000..f72f941ae5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape_tensor/config.pbtxt @@ -0,0 +1,26 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + is_shape_tensor: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape_tensor/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape_tensor/expected new file mode 100644 index 0000000000..a31b0168b9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_shape_tensor/expected @@ -0,0 +1 @@ +'INPUT0' is incorrectly specified as a shape tensor. 
diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_type/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_type/config.pbtxt new file mode 100644 index 0000000000..9a9337e334 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_type/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP16 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_type/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_type/expected new file mode 100644 index 0000000000..cd93c19cae --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_input_type/expected @@ -0,0 +1 @@ +unexpected datatype TYPE_FP32 for inference input 'INPUT0', expecting TYPE_FP16 for bad_input_type diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_dims/config.pbtxt new file mode 100644 index 0000000000..8ef432f5d6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_dims/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 7 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_dims/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_dims/expected new file mode 100644 index 0000000000..eda07bbfb8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_dims/expected @@ -0,0 +1 @@ +model 'bad_output_dims', tensor 'OUTPUT1': the model expects 2 dimensions (shape \[-1,16\]) but the model configuration specifies 2 dimensions (an initial batch dimension because max_batch_size > 0 followed by the explicit tensor shape, making complete shape \[-1,7\]) \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape/config.pbtxt new file mode 100644 index 0000000000..152a211ea7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16, 1] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape/expected new file mode 100644 index 0000000000..98bf527186 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape/expected @@ -0,0 +1 @@ +unable to autofill for 'bad_output_shape', model tensor configurations are contradicting each other in terms of whether batching is supported \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape_tensor/config.pbtxt 
b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape_tensor/config.pbtxt new file mode 100644 index 0000000000..46628b8592 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape_tensor/config.pbtxt @@ -0,0 +1,26 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + is_shape_tensor: true + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape_tensor/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape_tensor/expected new file mode 100644 index 0000000000..ec7ade3c8a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_shape_tensor/expected @@ -0,0 +1 @@ +'OUTPUT1' is incorrectly specified as a shape tensor. diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_type/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_type/config.pbtxt new file mode 100644 index 0000000000..d442656cd9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_type/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_type/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_type/expected new file mode 100644 index 0000000000..cf9262af4a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_output_type/expected @@ -0,0 +1 @@ +unexpected datatype TYPE_FP32 for inference output 'OUTPUT1', expecting TYPE_INT8 for bad_output_type diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/config.pbtxt new file mode 100644 index 0000000000..b36342c723 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/config.pbtxt @@ -0,0 +1,26 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + is_non_linear_format_io: true + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/expected new file mode 100644 index 0000000000..d2940e317f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/expected @@ -0,0 +1 @@ +'OUTPUT1' uses a linear IO format, but 'is_non_linear_format_io' is incorrectly set to true in the model configuration. 
diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_dims/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_dims/config.pbtxt new file mode 100644 index 0000000000..a33c5e383a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_dims/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "DUMMY_INPUT0" + data_type: TYPE_FP32 + dims: [ -1,-1 ] + }, + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 2 ] + } +] +output [ + { + name: "DUMMY_OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1,-1,-1 ] + }, + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 2 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_dims/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_dims/expected new file mode 100644 index 0000000000..e36bcf627a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_dims/expected @@ -0,0 +1 @@ +model tensor configurations are contradicting each other in terms of whether batching is supported diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/config.pbtxt new file mode 100644 index 0000000000..d664a4af15 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "DUMMY_INPUT0" + data_type: TYPE_FP32 + dims: [ -1,-1 ] + }, + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 3 ] + } +] +output [ + { + name: "DUMMY_OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1,-1 ] + }, + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 2 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/expected new file mode 100644 index 0000000000..e36bcf627a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/expected @@ -0,0 +1 @@ +model tensor configurations are contradicting each other in terms of whether batching is supported diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/too_few_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/too_few_inputs/config.pbtxt new file mode 100644 index 0000000000..eb9b9c17c2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/too_few_inputs/config.pbtxt @@ -0,0 +1,20 @@ +max_batch_size: 8 +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/too_few_inputs/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/too_few_inputs/expected new file mode 100644 index 0000000000..6226097417 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/too_few_inputs/expected @@ -0,0 +1 @@ +failed to specify the dimensions of all input tensors or values of all input shape tensors \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/too_many_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/too_many_inputs/config.pbtxt new file mode 100644 index 0000000000..85d53707b1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/too_many_inputs/config.pbtxt @@ -0,0 +1,30 
@@ +max_batch_size: 8 +input [ + { + name: "INPUT_EXTRA" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/too_many_inputs/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/too_many_inputs/expected new file mode 100644 index 0000000000..9a92ec8cbf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/too_many_inputs/expected @@ -0,0 +1 @@ +unexpected inference input 'INPUT_EXTRA', allowed inputs are: INPUT0, INPUT1 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_input/config.pbtxt new file mode 100644 index 0000000000..7b1195a1b1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_input/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT_UNKNOWN" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_input/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_input/expected new file mode 100644 index 0000000000..e2a2abbf09 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_input/expected @@ -0,0 +1 @@ +unexpected inference input 'INPUT_UNKNOWN', allowed inputs are: INPUT0, INPUT1 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_output/config.pbtxt new file mode 100644 index 0000000000..09d60567ad --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_output/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT_UNKNOWN" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_output/expected b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_output/expected new file mode 100644 index 0000000000..38fd5e2785 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform/tensorrt/unknown_output/expected @@ -0,0 +1 @@ +unexpected inference output 'OUTPUT_UNKNOWN', allowed outputs are: OUTPUT0, OUTPUT1 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/custom/empty_config.identity/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/custom/empty_config.identity/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/custom/empty_config.identity/expected b/qa/L0_model_config/autofill_noplatform_success/custom/empty_config.identity/expected new file mode 100644 index 0000000000..014c6dd3ad --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/custom/empty_config.identity/expected @@ -0,0 +1,23 @@ +name: "empty_config.identity" 
+version_policy { +latest { + num_versions: 1 +} +} +instance_group { +name: "empty_config.identity" +count: 1 +gpus: 0 +kind: KIND_GPU +} +default_model_filename: "model.identity" +optimization { +input_pinned_memory { + enable: true +} +output_pinned_memory { + enable: true +} +} +backend: "identity" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/custom/no_backend.identity/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/custom/no_backend.identity/config.pbtxt new file mode 100644 index 0000000000..575da253a5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/custom/no_backend.identity/config.pbtxt @@ -0,0 +1,15 @@ +max_batch_size: 64 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1000 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1000 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/custom/no_backend.identity/expected b/qa/L0_model_config/autofill_noplatform_success/custom/no_backend.identity/expected new file mode 100644 index 0000000000..c6c3ddcd8c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/custom/no_backend.identity/expected @@ -0,0 +1,34 @@ +name: "no_backend.identity" +version_policy { +latest { + num_versions: 1 +} +} +max_batch_size: 64 +input { +name: "INPUT0" +data_type: TYPE_INT32 +dims: 1000 +} +output { +name: "OUTPUT0" +data_type: TYPE_INT32 +dims: 1000 +} +instance_group { +name: "no_backend.identity" +count: 1 +gpus: 0 +kind: KIND_GPU +} +default_model_filename: "model.identity" +optimization { +input_pinned_memory { + enable: true +} +output_pinned_memory { + enable: true +} +} +backend: "identity" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/config.pbtxt new file mode 100644 index 0000000000..87ffa565f7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/config.pbtxt @@ -0,0 +1,33 @@ +name: "embedded_ensemble" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "inner_ensemble" + model_version: -1 + input_map { + key: "data" + value: "data" + } + output_map { + key: "prob" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/expected b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/expected new file mode 100644 index 0000000000..db3c2334b2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/expected @@ -0,0 +1,40 @@ +name: "embedded_ensemble" +max_batch_size: 2 +platform: "ensemble" +version_policy { + latest { + num_versions: 1 + } +} +ensemble_scheduling { + step [ + { + model_name: "inner_ensemble" + model_version: -1 + input_map { + key: "data" + value: "data" + } + output_map { + key: "prob" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +model_transaction_policy { +} \ No newline at end of file diff --git 
a/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/inner_ensemble/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/inner_ensemble/config.pbtxt new file mode 100644 index 0000000000..115412c088 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/embedded_ensemble/inner_ensemble/config.pbtxt @@ -0,0 +1,33 @@ +name: "inner_ensemble" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim2_nobatch/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim2_nobatch/config.pbtxt new file mode 100644 index 0000000000..54db15ca0c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim2_nobatch/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim2_nobatch" +max_batch_size: 0 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1, 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1, 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/config.pbtxt new file mode 100644 index 0000000000..26e919bcdb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/config.pbtxt @@ -0,0 +1,45 @@ +name: "inconsistent_shape" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + 
}, + { + model_name: "fp32_dim2_nobatch" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/expected b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/expected new file mode 100644 index 0000000000..fe249bbd6c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/expected @@ -0,0 +1,52 @@ +name: "inconsistent_shape" +max_batch_size: 2 +platform: "ensemble" +version_policy { + latest { + num_versions: 1 + } +} +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + }, + { + model_name: "fp32_dim2_nobatch" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +model_transaction_policy { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim1_batch4/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim1_batch4/config.pbtxt new file mode 100644 index 0000000000..f5a689d604 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim1_batch4/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim1_batch4" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim2_nobatch/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim2_nobatch/config.pbtxt new file mode 100644 index 0000000000..54db15ca0c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim2_nobatch/config.pbtxt @@ -0,0 +1,22 @@ +name: "fp32_dim2_nobatch" +max_batch_size: 0 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1, 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1, 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/config.pbtxt new file mode 100644 index 0000000000..431b5b27b0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/config.pbtxt @@ -0,0 +1,45 @@ +name: "inconsistent_shape_2" +max_batch_size: 0 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim2_nobatch" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + }, + { + 
model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 4, 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 4, 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/expected b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/expected new file mode 100644 index 0000000000..687eabf159 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/expected @@ -0,0 +1,52 @@ +name: "inconsistent_shape_2" +max_batch_size: 0 +platform: "ensemble" +version_policy { + latest { + num_versions: 1 + } +} +ensemble_scheduling { + step [ + { + model_name: "fp32_dim2_nobatch" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "temp_tensor" + } + }, + { + model_name: "fp32_dim1_batch4" + model_version: -1 + input_map { + key: "INPUT0" + value: "temp_tensor" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 4, 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 4, 16 ] + } +] +model_transaction_policy { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/fp32_dim1_batch4_output3/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/fp32_dim1_batch4_output3/config.pbtxt new file mode 100644 index 0000000000..69b18e83e7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/fp32_dim1_batch4_output3/config.pbtxt @@ -0,0 +1,42 @@ +name: "fp32_dim1_batch4_output3" +max_batch_size: 4 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/config.pbtxt new file mode 100644 index 0000000000..852d1b3448 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/config.pbtxt @@ -0,0 +1,41 @@ +name: "unmapped_output" +max_batch_size: 2 +platform: "ensemble" +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/expected 
b/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/expected new file mode 100644 index 0000000000..2f54380f3f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/expected @@ -0,0 +1,48 @@ +name: "unmapped_output" +max_batch_size: 2 +platform: "ensemble" +version_policy { + latest { + num_versions: 1 + } +} +ensemble_scheduling { + step [ + { + model_name: "fp32_dim1_batch4_output3" + model_version: -1 + input_map { + key: "INPUT0" + value: "data" + } + input_map { + key: "INPUT1" + value: "data" + } + input_map { + key: "INPUT2" + value: "data" + } + output_map { + key: "OUTPUT0" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +model_transaction_policy { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt new file mode 100644 index 0000000000..137ad375c8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt @@ -0,0 +1,23 @@ + +name: "cpu_instance" +platform: "onnxruntime_onnx" +max_batch_size: 8 +version_policy: { latest { num_versions: 1 }} +input [ + { + name: "INPUT0" + data_type: TYPE_FP16 + dims: [ -1,-1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP16 + dims: [ -1,-1 ] + } +] +instance_group { + name: "cpu_instance" + kind: KIND_CPU +} diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/expected b/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/expected new file mode 100644 index 0000000000..008a7a0b7f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/expected @@ -0,0 +1,36 @@ +name: "cpu_instance" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP16 + dims: -1 + dims: -1 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP16 + dims: -1 + dims: -1 +} +instance_group { + name: "cpu_instance" + count: 2 + kind: KIND_CPU +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/1/model.onnx b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! 
+CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected new file mode 100644 index 0000000000..bedc4e44fa --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.1 new file mode 100644 index 0000000000..7e2a45c522 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.1 @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.2 new file mode 100644 index 0000000000..56def5c317 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.2 @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.3 
b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.3 new file mode 100644 index 0000000000..35a82c5be1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/empty_config/expected.3 @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/1/model.onnx b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/1/model.onnx new file mode 100644 index 0000000000..b352d3225f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:Ô + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! +CAST1OUTPUT1"Cast* +to onnx_int32_int8_int8Z +INPUT0 + +var_0 +Z +INPUT1 + +var_0 +b +OUTPUT0 + +var_1 +b +OUTPUT1 + +var_2 +B diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected new file mode 100644 index 0000000000..f2a7d4e43e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected @@ -0,0 +1,48 @@ +name: "no_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.1 new file mode 100644 index 0000000000..ca6269959f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.1 @@ -0,0 +1,48 @@ +name: "no_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" 
diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.2 new file mode 100644 index 0000000000..51d73ebdfe --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.2 @@ -0,0 +1,48 @@ +name: "no_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.3 new file mode 100644 index 0000000000..c5121d60b5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config/expected.3 @@ -0,0 +1,48 @@ +name: "no_config" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/1/model.onnx b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/1/model.onnx new file mode 100644 index 0000000000..ebe41ef108 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/1/model.onnx @@ -0,0 +1,33 @@ +TRTIS:¸ + +INPUT0_INPUT0"Identity + +INPUT1_INPUT1"Identity + +_INPUT0 +_INPUT1CAST0"Add + +_INPUT0 +_INPUT1CAST1"Sub +! +CAST0OUTPUT0"Cast* +to  +! 
+CAST1OUTPUT1"Cast* +to onnx_nobatch_int32_int8_int8Z +INPUT0 + + +Z +INPUT1 + + +b +OUTPUT0 + + +b +OUTPUT1 + + +B diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_config_no_batch/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_config_no_batch/config.pbtxt rename to qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected new file mode 100644 index 0000000000..9adc820017 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected @@ -0,0 +1,43 @@ +name: "no_config_no_batch" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config_no_batch_0" + count: 2 + kind: KIND_CPU +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.1 b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.1 new file mode 100644 index 0000000000..5ba1985bd6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.1 @@ -0,0 +1,43 @@ +name: "no_config_no_batch" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config_no_batch_0" + count: 2 + kind: KIND_CPU +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.2 b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.2 new file mode 100644 index 0000000000..fa82234e53 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.2 @@ -0,0 +1,43 @@ +name: "no_config_no_batch" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config_no_batch_0" + count: 2 + kind: KIND_CPU +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.3 
b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.3 new file mode 100644 index 0000000000..e5e92cb9be --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/onnx/no_config_no_batch/expected.3 @@ -0,0 +1,43 @@ +name: "no_config_no_batch" +platform: "onnxruntime_onnx" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config_no_batch_0" + count: 2 + kind: KIND_CPU +} +default_model_filename: "model.onnx" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "onnxruntime" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected new file mode 100644 index 0000000000..21219a9f1b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected @@ -0,0 +1,46 @@ +name: "dynamic_batch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 4 +} +instance_group { + name: "dynamic_batch" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.1 b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.1 new file mode 100644 index 0000000000..5e6e16e636 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.1 @@ -0,0 +1,46 @@ +name: "dynamic_batch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "input1" + data_type: TYPE_INT32 + dims: 4 +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 4 +} +instance_group { + name: "dynamic_batch" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.2 b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.2 new file mode 100644 index 0000000000..7710b9be44 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.2 @@ -0,0 +1,46 @@ +name: "dynamic_batch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 4 +} +instance_group { + name: "dynamic_batch" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.3 b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.3 new file mode 100644 index 0000000000..299d7286af --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/dynamic_batch/expected.3 @@ -0,0 +1,46 @@ +name: "dynamic_batch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "input1" + data_type: TYPE_INT32 + dims: 4 +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 4 +} +instance_group { + name: "dynamic_batch" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected new file mode 100644 index 0000000000..327b6560c0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected @@ -0,0 +1,46 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.1 new file mode 100644 index 0000000000..5e21bdee82 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.1 @@ -0,0 +1,46 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.2 new file mode 100644 index 0000000000..30cbf9467b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.2 @@ -0,0 +1,46 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.3 new file mode 100644 index 0000000000..044930e8ad --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/empty_config/expected.3 @@ -0,0 +1,46 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected new file mode 100644 index 0000000000..24f54fd3ee --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected @@ -0,0 +1,46 @@ +name: "no_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: 
"Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "no_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.1 new file mode 100644 index 0000000000..b351867e69 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.1 @@ -0,0 +1,46 @@ +name: "no_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "no_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.2 new file mode 100644 index 0000000000..b41dfc199b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.2 @@ -0,0 +1,46 @@ +name: "no_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "no_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.3 new file mode 100644 index 0000000000..d4c9e3b6f8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/no_config/expected.3 @@ -0,0 +1,46 @@ +name: "no_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "input1" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +input { + name: "Func/PartitionedCall/input/_0:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_3:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +output { + name: "Func/PartitionedCall/output/_2:0" + data_type: TYPE_INT32 + dims: 1 + dims: 4 +} +instance_group { + name: "no_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} 
+backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt new file mode 100644 index 0000000000..cfdc579dae --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt @@ -0,0 +1,14 @@ +max_batch_size: 8 +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + label_filename: "output0_labels.txt" + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/expected b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/expected new file mode 100644 index 0000000000..c77e76c867 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/expected @@ -0,0 +1,44 @@ +name: "partial_config" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_INT8 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 + label_filename: "output0_labels.txt" +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "partial_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/expected.1 new file mode 100644 index 0000000000..82a1a71df9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/expected.1 @@ -0,0 +1,44 @@ +name: "partial_config" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_INT8 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 + label_filename: "output0_labels.txt" +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "partial_config" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.xml" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "openvino" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/config.pbtxt new file mode 100644 index 0000000000..4415942640 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/config.pbtxt @@ -0,0 +1,45 @@ +name: "conflicting_scheduler_ensemble" +platform: "ensemble" +input [ + { + name: "ENSEMBLE_INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "ENSEMBLE_OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +ensemble_scheduling { + step [ + { + # batch model + model_name: "ensemble_first_step" + model_version: 1 + input_map { + key: "INPUT0" + value: 
"ENSEMBLE_INPUT0" + } + output_map { + key: "OUTPUT0" + value: "temp_output_0" + } + }, + { + model_name: "ensemble_second_step" + model_version: 1 + input_map { + key: "INPUT0" + value: "temp_output_0" + } + output_map { + key: "OUTPUT0" + value: "ENSEMBLE_OUTPUT0" + } + } + ] +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/expected b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/expected new file mode 100644 index 0000000000..9ded94672a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/expected @@ -0,0 +1,45 @@ +name: "conflicting_scheduler_ensemble" + platform: "ensemble" + version_policy { + latest { + num_versions: 1 + } + } + input { + name: "ENSEMBLE_INPUT0" + data_type: TYPE_FP32 + dims: 4 + } + output { + name: "ENSEMBLE_OUTPUT0" + data_type: TYPE_FP32 + dims: 4 + } + ensemble_scheduling { + step { + model_name: "ensemble_first_step" + model_version: 1 + input_map { + key: "INPUT0" + value: "ENSEMBLE_INPUT0" + } + output_map { + key: "OUTPUT0" + value: "temp_output_0" + } + } + step { + model_name: "ensemble_second_step" + model_version: 1 + input_map { + key: "INPUT0" + value: "temp_output_0" + } + output_map { + key: "OUTPUT0" + value: "ENSEMBLE_OUTPUT0" + } + } + } + model_transaction_policy { + } \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py new file mode 100644 index 0000000000..57589bacdf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_output(output0) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/config.pbtxt new file mode 100644 index 0000000000..bb4ad9111c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/config.pbtxt @@ -0,0 +1,15 @@ +name: "ensemble_first_step" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py new file mode 100644 index 0000000000..57589bacdf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_output(output0) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/config.pbtxt new file mode 100644 index 0000000000..e1af511bc5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/config.pbtxt @@ -0,0 +1,15 @@ +name: "ensemble_second_step" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py new file mode 100644 index 0000000000..57589bacdf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_output(output0) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/config.pbtxt new file mode 100644 index 0000000000..c67582c241 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/config.pbtxt @@ -0,0 +1,26 @@ +name: "dynamic_batching" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected new file mode 100644 index 0000000000..09d462cb28 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected @@ -0,0 +1,47 @@ +name: "dynamic_batching" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.1 b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.1 new file mode 100644 index 0000000000..a941ad5548 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.1 @@ -0,0 +1,47 @@ +name: "dynamic_batching" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.2 b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.2 new file mode 100644 index 0000000000..ac6a13950c --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.2 @@ -0,0 +1,47 @@ +name: "dynamic_batching" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.3 b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.3 new file mode 100644 index 0000000000..f1b5c5cefd --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/expected.3 @@ -0,0 +1,47 @@ +name: "dynamic_batching" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py new file mode 100644 index 0000000000..b1399382c4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/config.pbtxt new file mode 100644 index 0000000000..928ee51f50 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/config.pbtxt @@ -0,0 +1,29 @@ +name: "dynamic_batching_no_op" +max_batch_size: 4 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +dynamic_batching: { + preferred_batch_size: [ 4 ] +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected new file mode 100644 index 0000000000..515cba2485 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected @@ -0,0 +1,47 @@ +name: "dynamic_batching_no_op" +version_policy { +latest { + num_versions: 1 +} +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.1 b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.1 new file mode 100644 index 0000000000..046a58bef6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.1 @@ -0,0 +1,47 @@ +name: "dynamic_batching_no_op" +version_policy { +latest { + num_versions: 1 +} +} 
+max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.2 b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.2 new file mode 100644 index 0000000000..52bc58aa9c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.2 @@ -0,0 +1,47 @@ +name: "dynamic_batching_no_op" +version_policy { +latest { + num_versions: 1 +} +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.3 b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.3 new file mode 100644 index 0000000000..8b32f40d48 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/expected.3 @@ -0,0 +1,47 @@ +name: "dynamic_batching_no_op" +version_policy { +latest { + num_versions: 1 +} +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "dynamic_batching_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +dynamic_batching { + preferred_batch_size: 4 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py new file mode 100644 index 0000000000..b1399382c4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_dynamic_batching() + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/empty_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected new file mode 100644 index 0000000000..c9d04026cc --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected @@ -0,0 +1,43 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.1 new file mode 100644 index 0000000000..57d4d9339c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.1 @@ -0,0 +1,43 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 
+} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.2 new file mode 100644 index 0000000000..acedc2af71 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.2 @@ -0,0 +1,43 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.3 new file mode 100644 index 0000000000..783841c7a6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/empty_config/expected.3 @@ -0,0 +1,43 @@ +name: "empty_config" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/config.pbtxt new file mode 100644 index 0000000000..b92d44b3ec --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/config.pbtxt @@ -0,0 +1,9 @@ +name: "incomplete_input" + +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/expected b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/expected new file mode 100644 index 0000000000..ca0caa06c1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/expected @@ -0,0 +1,43 @@ +name: "incomplete_input" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization 
{ + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py new file mode 100644 index 0000000000..75000a0ba4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_output/config.pbtxt new file mode 100644 index 0000000000..df7b925a2a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_output/config.pbtxt @@ -0,0 +1,12 @@ +name: "incomplete_output" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + }, + { + name: "OUTPUT1" + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_output/expected b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_output/expected new file mode 100644 index 0000000000..50e86ec30a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_output/expected @@ -0,0 +1,43 @@ +name: "incomplete_output" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/config.pbtxt new file mode 100644 index 0000000000..3100235010 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected new file mode 100644 index 0000000000..0d8cda5f0a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected @@ -0,0 +1,47 @@ +name: "model_transaction_policy" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + 
input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.1 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.1 new file mode 100644 index 0000000000..8d70f0de6b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.1 @@ -0,0 +1,47 @@ +name: "model_transaction_policy" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.2 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.2 new file mode 100644 index 0000000000..d19e1f3f2e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.2 @@ -0,0 +1,47 @@ +name: "model_transaction_policy" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.3 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.3 new file mode 100644 index 0000000000..619b818fa2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/expected.3 @@ -0,0 +1,47 @@ +name: "model_transaction_policy" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/model.py b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/model.py new file mode 100644 index 0000000000..424eca60ce 
--- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy/model.py @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_model_transaction_policy(dict(decoupled=True)) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/config.pbtxt new file mode 100644 index 0000000000..3100235010 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/config.pbtxt @@ -0,0 +1,24 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected new file mode 100644 index 0000000000..413a21bb23 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected @@ -0,0 +1,46 @@ +name: "model_transaction_policy_decoupled_false" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_decoupled_false" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.1 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.1 new file mode 100644 index 0000000000..d750463837 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.1 @@ -0,0 +1,46 @@ +name: "model_transaction_policy_decoupled_false" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_decoupled_false" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.2 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.2 new file mode 100644 index 0000000000..d82a9fc3c6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.2 @@ -0,0 +1,46 @@ +name: "model_transaction_policy_decoupled_false" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_decoupled_false" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.3 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.3 new file mode 100644 index 0000000000..ed1f10fac8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/expected.3 @@ -0,0 +1,46 @@ +name: 
"model_transaction_policy_decoupled_false" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_decoupled_false" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { +} \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/model.py b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/model.py new file mode 100644 index 0000000000..848af2a2b2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_decoupled_false/model.py @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_model_transaction_policy(dict(decoupled=False)) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/config.pbtxt new file mode 100644 index 0000000000..1bbf76caaf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/config.pbtxt @@ -0,0 +1,28 @@ +model_transaction_policy { + decoupled: true +} + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected new file mode 100644 index 0000000000..1c8ba1ada4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected @@ -0,0 +1,47 @@ +name: "model_transaction_policy_no_op" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.1 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.1 new file mode 100644 index 0000000000..4a854508a6 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.1 @@ -0,0 +1,47 @@ +name: "model_transaction_policy_no_op" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory 
{ + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.2 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.2 new file mode 100644 index 0000000000..b6f5dbf368 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.2 @@ -0,0 +1,47 @@ +name: "model_transaction_policy_no_op" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.3 b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.3 new file mode 100644 index 0000000000..9c9fbf99a9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/expected.3 @@ -0,0 +1,47 @@ +name: "model_transaction_policy_no_op" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "model_transaction_policy_no_op" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" +model_transaction_policy { + decoupled: true +} diff --git a/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/model.py b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/model.py new file mode 100644 index 0000000000..424eca60ce --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/model_transaction_policy_no_op/model.py @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(4) + auto_complete_model_config.set_model_transaction_policy(dict(decoupled=True)) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/optional_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/optional_input/config.pbtxt new file mode 100644 index 0000000000..2d2868b90e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/optional_input/config.pbtxt @@ -0,0 +1,7 @@ +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/optional_input/expected b/qa/L0_model_config/autofill_noplatform_success/python/optional_input/expected new file mode 100644 index 0000000000..f298e4629c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/optional_input/expected @@ -0,0 +1,44 @@ +name: "optional_input" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 + optional: true +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "optional_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/optional_input/model.py b/qa/L0_model_config/autofill_noplatform_success/python/optional_input/model.py new file mode 100644 index 0000000000..fca8e06818 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/optional_input/model.py @@ -0,0 +1,48 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = { + "name": "INPUT0", + "data_type": "TYPE_FP32", + "dims": [4], + "optional": True, + } + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/L0_model_config/autofill_noplatform_success/python/unknown_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/unknown_input/config.pbtxt new file mode 100644 index 0000000000..c76ea45e21 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/unknown_input/config.pbtxt @@ -0,0 +1,26 @@ +name: "unknown_input" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT_UNKNOWN" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/unknown_input/expected b/qa/L0_model_config/autofill_noplatform_success/python/unknown_input/expected new file mode 100644 index 0000000000..c7da54358d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/unknown_input/expected @@ -0,0 +1,48 @@ +name: "unknown_input" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT_UNKNOWN" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "unknown_input" + count: 1 + gpus: 0 + kind: 
KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/python/unknown_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/python/unknown_output/config.pbtxt new file mode 100644 index 0000000000..78fe02c570 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/unknown_output/config.pbtxt @@ -0,0 +1,26 @@ +name: "unknown_output" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT_UNKNOWN" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/python/unknown_output/expected b/qa/L0_model_config/autofill_noplatform_success/python/unknown_output/expected new file mode 100644 index 0000000000..7d3bf765bb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/python/unknown_output/expected @@ -0,0 +1,48 @@ +name: "unknown_output" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT_UNKNOWN" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 4 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 4 +} +instance_group { + name: "unknown_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.py" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "python" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/pytorch/cpu_instance/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/pytorch/cpu_instance/config.pbtxt new file mode 100644 index 0000000000..9d1564bed4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/pytorch/cpu_instance/config.pbtxt @@ -0,0 +1,42 @@ +name: "cpu_instance" +platform: "pytorch_libtorch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT__0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT__1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT__0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT__1" + data_type: TYPE_INT32 + dims: 16 +} +instance_group { + name: "cpu_instance" + kind: KIND_CPU +} +default_model_filename: "model.pt" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "pytorch" diff --git a/qa/L0_model_config/autofill_noplatform_success/pytorch/cpu_instance/expected b/qa/L0_model_config/autofill_noplatform_success/pytorch/cpu_instance/expected new file mode 100644 index 0000000000..3d5cd8545f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/pytorch/cpu_instance/expected @@ -0,0 +1,44 @@ +name: "cpu_instance" +platform: "pytorch_libtorch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT__0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT__1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT__0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT__1" + data_type: TYPE_INT32 + dims: 16 +} +instance_group { + name: 
"cpu_instance" + count: 1 + kind: KIND_CPU +} +default_model_filename: "model.pt" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "pytorch" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/pytorch/no_name_platform/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/pytorch/no_name_platform/config.pbtxt new file mode 100644 index 0000000000..412b42b1a3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/pytorch/no_name_platform/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT__0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT__1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT__0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT__1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/pytorch/no_name_platform/expected b/qa/L0_model_config/autofill_noplatform_success/pytorch/no_name_platform/expected new file mode 100644 index 0000000000..34a14958d4 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/pytorch/no_name_platform/expected @@ -0,0 +1,45 @@ +name: "no_name_platform" +platform: "pytorch_libtorch" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT__0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT__1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT__0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT__1" + data_type: TYPE_INT32 + dims: 16 +} +instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.pt" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "pytorch" +runtime: "" diff --git a/src/servables/tensorflow/testdata/graphdef_autofill_sanity/no_name_platform/1/model.graphdef b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/1/model.graphdef similarity index 100% rename from src/servables/tensorflow/testdata/graphdef_autofill_sanity/no_name_platform/1/model.graphdef rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/1/model.graphdef diff --git a/src/servables/tensorflow/testdata/graphdef_autofill_sanity/no_name_platform/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/graphdef_autofill_sanity/no_name_platform/config.pbtxt rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/expected new file mode 100644 index 0000000000..ed01acd5e0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/no_name_platform/expected @@ -0,0 +1,45 @@ +name: "no_name_platform" +platform: "tensorflow_graphdef" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} 
+instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.graphdef" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/config.pbtxt new file mode 100644 index 0000000000..b3bc21377e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/config.pbtxt @@ -0,0 +1,41 @@ +name: "reshape_config_provided" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/expected new file mode 100644 index 0000000000..51e2d46d42 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/expected @@ -0,0 +1,59 @@ +name: "reshape_config_provided" +platform: "tensorflow_graphdef" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +instance_group { + name: "reshape_config_provided" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.graphdef" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_few_inputs/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_few_inputs/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/config.pbtxt new file mode 100644 index 0000000000..bf4222124a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/config.pbtxt @@ -0,0 +1,44 @@ +name: "cpu_instance" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: 
"OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "cpu_instance" + kind: KIND_CPU +} +dynamic_batching { +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected new file mode 100644 index 0000000000..f60d0950f1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected @@ -0,0 +1,47 @@ +name: "cpu_instance" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "cpu_instance" + count: 2 + kind: KIND_CPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.1 new file mode 100644 index 0000000000..dfcf1c7e89 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.1 @@ -0,0 +1,47 @@ +name: "cpu_instance" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "cpu_instance" + count: 2 + kind: KIND_CPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.2 new file mode 100644 index 0000000000..03a9721822 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.2 @@ -0,0 +1,47 @@ +name: "cpu_instance" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "cpu_instance" + count: 2 + kind: KIND_CPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} 
+backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.3 new file mode 100644 index 0000000000..4d69237a2e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/cpu_instance/expected.3 @@ -0,0 +1,47 @@ +name: "cpu_instance" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "cpu_instance" + count: 2 + kind: KIND_CPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_many_inputs/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/too_many_inputs/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected new file mode 100644 index 0000000000..abbc108196 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.1 new file mode 100644 index 0000000000..164b3afd2f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.1 @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: 
"INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.2 new file mode 100644 index 0000000000..6ad6e0d311 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.2 @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.3 new file mode 100644 index 0000000000..9298a2dc33 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/empty_config/expected.3 @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_input/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_input/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/config.pbtxt new file mode 100644 index 0000000000..e2c3c36d49 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/config.pbtxt @@ -0,0 +1,10 @@ +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + }, + { + name: "OUTPUT1" + dims: [ -1, 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected new file mode 100644 index 0000000000..5ba092a8ad --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_1" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_1" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.1 new file mode 100644 index 0000000000..a2db3d6b62 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.1 @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_1" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_1" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.2 new file mode 100644 index 0000000000..f847b58097 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.2 @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_1" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_1" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git 
a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.3 new file mode 100644 index 0000000000..0e09e46e87 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_1/expected.3 @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_1" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_1" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_output/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/unknown_output/1/model.savedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/config.pbtxt new file mode 100644 index 0000000000..a4faf54369 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/config.pbtxt @@ -0,0 +1,10 @@ +output [ + { + name: "OUTPUT1" + dims: [ -1, 16 ] + }, + { + name: "OUTPUT0" + data_type: TYPE_INT8 + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected new file mode 100644 index 0000000000..137a62f2c1 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_2" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_2" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.1 new file mode 100644 index 0000000000..fd9da45fc4 --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.1 @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_2" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_2" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.2 new file mode 100644 index 0000000000..efbb5a2a0c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.2 @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_2" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_2" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.3 new file mode 100644 index 0000000000..27fa02d910 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/hint_for_no_batch_2/expected.3 @@ -0,0 +1,48 @@ +name: "hint_for_no_batch_2" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch_2" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/1/model.savedmodel/saved_model.pb differ diff --git 
a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/config.pbtxt new file mode 100644 index 0000000000..29ee883a4b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/config.pbtxt @@ -0,0 +1,10 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + }, + { + name: "INPUT1" + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected new file mode 100644 index 0000000000..42eb4b0821 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.1 new file mode 100644 index 0000000000..c5925abf6b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.1 @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.2 new file mode 100644 index 0000000000..0951a6ceaf --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.2 @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: 
KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.3 new file mode 100644 index 0000000000..c2e88938bb --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_input/expected.3 @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/1/model.savedmodel/saved_model.pb differ diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/config.pbtxt new file mode 100644 index 0000000000..fa9cc35967 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/config.pbtxt @@ -0,0 +1,10 @@ +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + }, + { + name: "OUTPUT1" + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected new file mode 100644 index 0000000000..2e1f32882f --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.1 
b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.1 new file mode 100644 index 0000000000..cf9d68e891 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.1 @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.2 new file mode 100644 index 0000000000..48deb2c7fe --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.2 @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.3 new file mode 100644 index 0000000000..c3f49cdfd7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/incomplete_output/expected.3 @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null 
and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/1/model.savedmodel/saved_model.pb differ diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/config.pbtxt new file mode 100644 index 0000000000..78cc4480b8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/config.pbtxt @@ -0,0 +1,3 @@ +instance_group { + kind: KIND_MODEL +} diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected new file mode 100644 index 0000000000..7f1b142e3b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected @@ -0,0 +1,47 @@ +name: "kind_model_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "kind_model_config_0" + count: 1 + kind: KIND_MODEL +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.1 new file mode 100644 index 0000000000..61cfcc6a23 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.1 @@ -0,0 +1,47 @@ +name: "kind_model_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "kind_model_config_0" + count: 1 + kind: KIND_MODEL +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.2 new file mode 100644 index 0000000000..4b0ddbeb8e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.2 @@ -0,0 +1,47 @@ +name: "kind_model_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 
+ dims: 16 +} +instance_group { + name: "kind_model_config_0" + count: 1 + kind: KIND_MODEL +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.3 new file mode 100644 index 0000000000..abea687937 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/kind_model_config/expected.3 @@ -0,0 +1,47 @@ +name: "kind_model_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "kind_model_config_0" + count: 1 + kind: KIND_MODEL +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/1/model.savedmodel/saved_model.pb differ diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/config.pbtxt new file mode 100644 index 0000000000..1cf214cafe --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/config.pbtxt @@ -0,0 +1 @@ +max_batch_size: 8 \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected new file mode 100644 index 0000000000..fcf0de4262 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected @@ -0,0 +1,48 @@ +name: "max_batch_size_set" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "max_batch_size_set" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git 
a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.1 new file mode 100644 index 0000000000..4b1dc1abd2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.1 @@ -0,0 +1,48 @@ +name: "max_batch_size_set" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "max_batch_size_set" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.2 new file mode 100644 index 0000000000..9acbbe3f12 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.2 @@ -0,0 +1,48 @@ +name: "max_batch_size_set" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "max_batch_size_set" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.3 new file mode 100644 index 0000000000..e129508a01 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/max_batch_size_set/expected.3 @@ -0,0 +1,48 @@ +name: "max_batch_size_set" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "max_batch_size_set" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/1/model.savedmodel/saved_model.pb 
b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/1/model.savedmodel/saved_model.pb differ diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected new file mode 100644 index 0000000000..2250f91f71 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected @@ -0,0 +1,48 @@ +name: "no_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.1 new file mode 100644 index 0000000000..56c1221734 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.1 @@ -0,0 +1,48 @@ +name: "no_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.2 new file mode 100644 index 0000000000..30875b1998 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.2 @@ -0,0 +1,48 @@ +name: "no_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git 
a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.3 new file mode 100644 index 0000000000..469b9aff76 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config/expected.3 @@ -0,0 +1,48 @@ +name: "no_config" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 4 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 4 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_config_no_batch/1/vnetsavedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/1/model.savedmodel/saved_model.pb similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_config_no_batch/1/vnetsavedmodel/saved_model.pb rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/1/model.savedmodel/saved_model.pb diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/config.pbtxt new file mode 100644 index 0000000000..5913902a76 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/config.pbtxt @@ -0,0 +1,5 @@ +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/expected new file mode 100644 index 0000000000..165300aa9b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_config_no_batch/expected @@ -0,0 +1,41 @@ +name: "no_config_no_batch" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "input" + data_type: TYPE_FP32 + dims: 1 + dims: 256 + dims: 256 + dims: 256 + dims: 1 +} +output { + name: "output" + data_type: TYPE_FP32 + dims: 1 + dims: 256 + dims: 256 + dims: 256 + dims: 14 +} +instance_group { + name: "no_config_no_batch_0" + count: 2 + kind: KIND_CPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/1/model.savedmodel/saved_model.pb differ diff --git 
a/src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_name_platform/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/config.pbtxt similarity index 100% rename from src/servables/tensorflow/testdata/savedmodel_autofill_sanity/no_name_platform/config.pbtxt rename to qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/config.pbtxt diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected new file mode 100644 index 0000000000..393000147a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected @@ -0,0 +1,45 @@ +name: "no_name_platform" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.1 new file mode 100644 index 0000000000..1a9c47cca7 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.1 @@ -0,0 +1,45 @@ +name: "no_name_platform" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.2 new file mode 100644 index 0000000000..c47e51aeb3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.2 @@ -0,0 +1,45 @@ +name: "no_name_platform" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } 
+} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.3 new file mode 100644 index 0000000000..42adbbf4d3 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/no_name_platform/expected.3 @@ -0,0 +1,45 @@ +name: "no_name_platform" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt new file mode 100644 index 0000000000..95f67e119e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt @@ -0,0 +1,34 @@ +name: "reshape_config_provided" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 8 ] + reshape: { shape: [ 4,1,2 ] } + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 8 ] + reshape: { shape: [ 4,1,2 ] } + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected new file mode 100644 index 0000000000..4fd8a8edb9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected @@ -0,0 +1,62 @@ +name: "reshape_config_provided" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +instance_group { + name: "reshape_config_provided" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.1 new file mode 100644 index 0000000000..87d646c314 --- 
/dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.1 @@ -0,0 +1,62 @@ +name: "reshape_config_provided" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +instance_group { + name: "reshape_config_provided" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.2 new file mode 100644 index 0000000000..3605cb1fc0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.2 @@ -0,0 +1,62 @@ +name: "reshape_config_provided" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +instance_group { + name: "reshape_config_provided" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.3 new file mode 100644 index 0000000000..c273096707 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/expected.3 @@ -0,0 +1,62 @@ +name: "reshape_config_provided" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 1 + reshape { + } +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 8 + reshape { + shape: 4 + shape: 1 + shape: 2 + } +} +instance_group { + name: "reshape_config_provided" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: 
"tensorflow" +runtime: "" \ No newline at end of file diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/1/model.savedmodel/saved_model.pb b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/1/model.savedmodel/saved_model.pb new file mode 100644 index 0000000000..a76abafbf7 Binary files /dev/null and b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/1/model.savedmodel/saved_model.pb differ diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/config.pbtxt new file mode 100644 index 0000000000..2814fb7e5c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/config.pbtxt @@ -0,0 +1,20 @@ +max_batch_size: 1 +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected new file mode 100644 index 0000000000..c41ed15143 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected @@ -0,0 +1,45 @@ +name: "too_few_inputs" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "too_few_inputs" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.1 new file mode 100644 index 0000000000..0a4b67356d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.1 @@ -0,0 +1,45 @@ +name: "too_few_inputs" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "too_few_inputs" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.2 new file mode 100644 index 0000000000..626db7022b --- /dev/null +++ 
b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.2 @@ -0,0 +1,45 @@ +name: "too_few_inputs" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "too_few_inputs" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.3 new file mode 100644 index 0000000000..5c93813b17 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/too_few_inputs/expected.3 @@ -0,0 +1,45 @@ +name: "too_few_inputs" +platform: "tensorflow_savedmodel" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 1 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_INT32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT8 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_INT8 + dims: 16 +} +instance_group { + name: "too_few_inputs" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.savedmodel" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorflow" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config/expected new file mode 100644 index 0000000000..9ddf6080a8 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config/expected @@ -0,0 +1,48 @@ +name: "empty_config" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "empty_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config_variable/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config_variable/config.pbtxt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config_variable/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config_variable/expected new file mode 100644 index 
0000000000..f5a6b625e9 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/empty_config_variable/expected @@ -0,0 +1,48 @@ +name: "empty_config_variable" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: -1 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: -1 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: -1 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: -1 +} +instance_group { + name: "empty_config_variable" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/hint_for_no_batch/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/hint_for_no_batch/config.pbtxt new file mode 100644 index 0000000000..50329ef203 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/hint_for_no_batch/config.pbtxt @@ -0,0 +1,21 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1, 16 ] + }, + { + name: "INPUT1" + dims: [ -1, 16 ] + } +] +output [ + { + name: "OUTPUT0" + dims: [ -1, 16 ] + }, + { + name: "OUTPUT1" + dims: [ -1, 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/hint_for_no_batch/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/hint_for_no_batch/expected new file mode 100644 index 0000000000..98f369d44e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/hint_for_no_batch/expected @@ -0,0 +1,48 @@ +name: "hint_for_no_batch" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: -1 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: -1 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: -1 + dims: 16 +} +instance_group { + name: "hint_for_no_batch" + count: 1 + gpus: 0 + kind: KIND_GPU +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/config.pbtxt new file mode 100644 index 0000000000..be47907359 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/config.pbtxt @@ -0,0 +1,10 @@ +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + }, + { + name: "INPUT1" + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected new file mode 100644 index 0000000000..24e41aabd2 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: 
"OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.1 new file mode 100644 index 0000000000..42071819df --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.1 @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.2 new file mode 100644 index 0000000000..75c6f0941a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.2 @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.3 new file mode 100644 index 0000000000..1f9b17efc0 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_input/expected.3 @@ -0,0 +1,48 @@ +name: "incomplete_input" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_input" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: 
"tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/config.pbtxt new file mode 100644 index 0000000000..ebca692e95 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/config.pbtxt @@ -0,0 +1,10 @@ +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + }, + { + name: "OUTPUT1" + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected new file mode 100644 index 0000000000..d4e3d3e39c --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.1 b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.1 new file mode 100644 index 0000000000..580a784fff --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.1 @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.2 b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.2 new file mode 100644 index 0000000000..de19771f7a --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.2 @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + 
enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.3 b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.3 new file mode 100644 index 0000000000..e74c75cf26 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/incomplete_output/expected.3 @@ -0,0 +1,48 @@ +name: "incomplete_output" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "incomplete_output" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/multi_prof_max_bs/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/multi_prof_max_bs/config.pbtxt new file mode 100644 index 0000000000..c60296ee07 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/multi_prof_max_bs/config.pbtxt @@ -0,0 +1,17 @@ +instance_group [ + { + profile: "0" + } +] + +instance_group [ + { + profile: "1" + } +] + +instance_group [ + { + profile: "2" + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/multi_prof_max_bs/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/multi_prof_max_bs/expected new file mode 100644 index 0000000000..4e482e182d --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/multi_prof_max_bs/expected @@ -0,0 +1,63 @@ +name: "multi_prof_max_bs" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: -1 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: -1 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: -1 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: -1 +} +instance_group { + name: "multi_prof_max_bs_0" + count: 1 + gpus: 0 + kind: KIND_GPU + profile: "0" +} +instance_group { + name: "multi_prof_max_bs_1" + count: 1 + gpus: 0 + kind: KIND_GPU + profile: "1" +} +instance_group { + name: "multi_prof_max_bs_2" + count: 1 + gpus: 0 + kind: KIND_GPU + profile: "2" +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config/expected new file mode 100644 index 0000000000..80351a3b6e --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config/expected @@ -0,0 +1,48 @@ +name: "no_config" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 
16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "no_config" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_non_linear_format_io/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_non_linear_format_io/expected new file mode 100644 index 0000000000..7f312196e5 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_non_linear_format_io/expected @@ -0,0 +1,57 @@ +name: "no_config_non_linear_format_io" +platform: "tensorrt_plan" +backend: "tensorrt" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: -1 + dims: 2 + dims: 1 + is_non_linear_format_io: true +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: -1 + dims: 2 + dims: 1 + is_non_linear_format_io: true +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: -1 + dims: 2 + dims: 1 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: -1 + dims: 2 + dims: 1 +} +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +dynamic_batching { + preferred_batch_size: 8 +} +instance_group { + name: "no_config_non_linear_format_io" + kind: KIND_GPU + count: 1 + gpus: 0 +} +default_model_filename: "model.plan" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_shape_tensor/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_shape_tensor/expected new file mode 100644 index 0000000000..d26ded10aa --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_shape_tensor/expected @@ -0,0 +1,52 @@ +name: "no_config_shape_tensor" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_INT32 + dims: 2 + is_shape_tensor: true +} +input { + name: "DUMMY_INPUT0" + data_type: TYPE_FP32 + dims: -1 + dims: -1 +} +output { + name: "DUMMY_OUTPUT0" + data_type: TYPE_FP32 + dims: -1 + dims: -1 +} +output { + name: "OUTPUT0" + data_type: TYPE_INT64 + dims: 2 + is_shape_tensor: true +} +instance_group { + name: "no_config_shape_tensor" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_variable/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_variable/expected new file mode 100644 index 0000000000..7524614e05 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_config_variable/expected @@ -0,0 +1,48 @@ +name: "no_config_variable" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: -1 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: -1 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: -1 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: -1 +} +instance_group { + name: 
"no_config_variable" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform/config.pbtxt new file mode 100644 index 0000000000..b922983950 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform/expected new file mode 100644 index 0000000000..039f974c6b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform/expected @@ -0,0 +1,48 @@ +name: "no_name_platform" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "no_name_platform" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform_variable/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform_variable/config.pbtxt new file mode 100644 index 0000000000..b922983950 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform_variable/config.pbtxt @@ -0,0 +1,25 @@ +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform_variable/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform_variable/expected new file mode 100644 index 0000000000..3ec6b85a34 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/no_name_platform_variable/expected @@ -0,0 +1,48 @@ +name: "no_name_platform_variable" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 16 +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 16 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 16 +} +instance_group { + name: "no_name_platform_variable" + count: 1 + gpus: 0 + kind: KIND_GPU +} 
+dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/reshape_config_provided/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorrt/reshape_config_provided/config.pbtxt new file mode 100644 index 0000000000..b4a9aee896 --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/reshape_config_provided/config.pbtxt @@ -0,0 +1,62 @@ +name: "reshape_config_provided" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4,4 ] + reshape: { shape: [ 2,2,4 ] } + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 2,2,4 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 2 ] + reshape: { shape: [ 1,2,1 ] } + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1,2,1 ] + } +] +input [ + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ 2,2,3 ] + reshape: { shape: [ 3,2,2 ] } + } +] +output [ + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ 3,2,2 ] + } +] +input [ + { + name: "INPUT3" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ 1,1,1 ] } + } +] +output [ + { + name: "OUTPUT3" + data_type: TYPE_FP32 + dims: [ 1,1,1 ] + } +] diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorrt/reshape_config_provided/expected b/qa/L0_model_config/autofill_noplatform_success/tensorrt/reshape_config_provided/expected new file mode 100644 index 0000000000..015b54111b --- /dev/null +++ b/qa/L0_model_config/autofill_noplatform_success/tensorrt/reshape_config_provided/expected @@ -0,0 +1,99 @@ +name: "reshape_config_provided" +platform: "tensorrt_plan" +version_policy { + latest { + num_versions: 1 + } +} +max_batch_size: 8 +input { + name: "INPUT0" + data_type: TYPE_FP32 + dims: 4 + dims: 4 + reshape { + shape: 2 + shape: 2 + shape: 4 + } +} +input { + name: "INPUT1" + data_type: TYPE_FP32 + dims: 2 + reshape { + shape: 1 + shape: 2 + shape: 1 + } +} +input { + name: "INPUT2" + data_type: TYPE_FP32 + dims: 2 + dims: 2 + dims: 3 + reshape { + shape: 3 + shape: 2 + shape: 2 + } +} +input { + name: "INPUT3" + data_type: TYPE_FP32 + dims: 1 + reshape { + shape: 1 + shape: 1 + shape: 1 + } +} +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: 2 + dims: 2 + dims: 4 +} +output { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: 1 + dims: 2 + dims: 1 +} +output { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: 3 + dims: 2 + dims: 2 +} +output { + name: "OUTPUT3" + data_type: TYPE_FP32 + dims: 1 + dims: 1 + dims: 1 +} +instance_group { + name: "reshape_config_provided" + count: 1 + gpus: 0 + kind: KIND_GPU +} +dynamic_batching { + preferred_batch_size: 8 +} +default_model_filename: "model.plan" +optimization { + input_pinned_memory { + enable: true + } + output_pinned_memory { + enable: true + } +} +backend: "tensorrt" +runtime: "" diff --git a/qa/L0_model_config/cli_messages/cli_deprecation/expected b/qa/L0_model_config/cli_messages/cli_deprecation/expected new file mode 100644 index 0000000000..3205f6a9c2 --- /dev/null +++ b/qa/L0_model_config/cli_messages/cli_deprecation/expected @@ -0,0 +1 @@ +Warning: '--strict-model-config' has been deprecated! Please use '--disable-auto-complete-config' instead. 
\ No newline at end of file diff --git a/qa/L0_model_config/cli_messages/cli_override/expected b/qa/L0_model_config/cli_messages/cli_override/expected new file mode 100644 index 0000000000..51553c31ec --- /dev/null +++ b/qa/L0_model_config/cli_messages/cli_override/expected @@ -0,0 +1 @@ +Warning: Overriding deprecated '--strict-model-config' from False to True in favor of '--disable-auto-complete-config'! \ No newline at end of file diff --git a/qa/L0_model_config/compare_status.py b/qa/L0_model_config/compare_status.py new file mode 100755 index 0000000000..dbed05772a --- /dev/null +++ b/qa/L0_model_config/compare_status.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import json +import os +import sys + +import tritonclient.grpc as grpcclient +import tritonclient.grpc.model_config_pb2 as mc +import tritonclient.http as httpclient +from google.protobuf import json_format, text_format +from tritonclient.utils import * + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--expected_dir", + type=str, + required=True, + help="Directory containing expected output files", + ) + parser.add_argument("--model", type=str, required=True, help="Model name") + FLAGS, unparsed = parser.parse_known_args() + + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + model_name = FLAGS.model + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient(url=pair[0], verbose=False) + model_config = triton_client.get_model_config(model_name) + else: + triton_client = grpcclient.InferenceServerClient(url=pair[0], verbose=False) + model_config = triton_client.get_model_config(model_name) + + nonmatch = list() + expected_files = [ + f + for f in os.listdir(FLAGS.expected_dir) + if ( + os.path.isfile(os.path.join(FLAGS.expected_dir, f)) + and (f.startswith("expected")) + ) + ] + for efile in expected_files: + with open(os.path.join(FLAGS.expected_dir, efile)) as f: + config = text_format.Parse(f.read(), mc.ModelConfig()) + + if pair[1] == "http": + config_json = json.loads( + json_format.MessageToJson(config, preserving_proto_field_name=True) + ) + if config_json == model_config: + sys.exit(0) + else: + if config == model_config.config: + sys.exit(0) + + nonmatch.append(config) + + print("Model config doesn't match any expected output:") + print("Model config:") + print(model_config) + for nm in nonmatch: + print("Non-matching:") + print(nm) + + sys.exit(1) diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source0/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_less_source0/config.pbtxt new file mode 100644 index 0000000000..2b784afc77 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source0/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_input_less_source0" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ELEMENT_COUNT + target_name: "BATCH_INPUT" + data_type: TYPE_FP32 + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source0/expected b/qa/L0_model_config/noautofill_platform/batch_input_less_source0/expected new file mode 100644 index 0000000000..e96bc39270 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source0/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_ELEMENT_COUNT' expects 1 source input, got 0 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source0/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_less_source0/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source0/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source1/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_less_source1/config.pbtxt new file mode 100644 index 0000000000..a7200b4d9c --- /dev/null +++ 
b/qa/L0_model_config/noautofill_platform/batch_input_less_source1/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_input_less_source1" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ACCUMULATED_ELEMENT_COUNT + target_name: "BATCH_INPUT" + data_type: TYPE_FP32 + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source1/expected b/qa/L0_model_config/noautofill_platform/batch_input_less_source1/expected new file mode 100644 index 0000000000..1bb8148409 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source1/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_ACCUMULATED_ELEMENT_COUNT' expects 1 source input, got 0 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source1/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_less_source1/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source1/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source2/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_less_source2/config.pbtxt new file mode 100644 index 0000000000..cf13307e0a --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source2/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_input_less_source2" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO + target_name: "BATCH_INPUT" + data_type: TYPE_FP32 + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source2/expected b/qa/L0_model_config/noautofill_platform/batch_input_less_source2/expected new file mode 100644 index 0000000000..d379897ca2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source2/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO' expects 1 source input, got 0 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source2/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_less_source2/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source2/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source3/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_less_source3/config.pbtxt new file mode 100644 index 0000000000..68edf85715 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source3/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_input_less_source3" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_MAX_ELEMENT_COUNT_AS_SHAPE + target_name: "BATCH_INPUT" + data_type: TYPE_FP32 + } +] \ No newline at end of file diff --git 
a/qa/L0_model_config/noautofill_platform/batch_input_less_source3/expected b/qa/L0_model_config/noautofill_platform/batch_input_less_source3/expected new file mode 100644 index 0000000000..dde21a1ee0 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source3/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_MAX_ELEMENT_COUNT_AS_SHAPE' expects 1 source input, got 0 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_less_source3/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_less_source3/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_less_source3/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source0/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_many_source0/config.pbtxt new file mode 100644 index 0000000000..d24e629a26 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source0/config.pbtxt @@ -0,0 +1,23 @@ +name: "batch_input_many_source0" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ELEMENT_COUNT + target_name: "BATCH_AND_SIZE_INPUT" + data_type: TYPE_FP32 + source_input: ["INPUT", "INPUT"] + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source0/expected b/qa/L0_model_config/noautofill_platform/batch_input_many_source0/expected new file mode 100644 index 0000000000..36ec328e2b --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source0/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_ELEMENT_COUNT' expects 1 source input, got 2 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source0/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_many_source0/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source0/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source1/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_many_source1/config.pbtxt new file mode 100644 index 0000000000..30805521e4 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source1/config.pbtxt @@ -0,0 +1,23 @@ +name: "batch_input_many_source1" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ACCUMULATED_ELEMENT_COUNT + target_name: "BATCH_AND_SIZE_INPUT" + data_type: TYPE_FP32 + source_input: ["INPUT", "INPUT"] + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source1/expected b/qa/L0_model_config/noautofill_platform/batch_input_many_source1/expected new file mode 100644 index 0000000000..c193f9987c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source1/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_ACCUMULATED_ELEMENT_COUNT' expects 1 source input, got 2 diff --git 
a/qa/L0_model_config/noautofill_platform/batch_input_many_source1/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_many_source1/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source1/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source2/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_many_source2/config.pbtxt new file mode 100644 index 0000000000..683bbe7271 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source2/config.pbtxt @@ -0,0 +1,23 @@ +name: "batch_input_many_source2" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO + target_name: "BATCH_AND_SIZE_INPUT" + data_type: TYPE_FP32 + source_input: ["INPUT", "INPUT"] + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source2/expected b/qa/L0_model_config/noautofill_platform/batch_input_many_source2/expected new file mode 100644 index 0000000000..70db5c3352 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source2/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO' expects 1 source input, got 2 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source2/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_many_source2/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source2/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source3/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_many_source3/config.pbtxt new file mode 100644 index 0000000000..2a96b00274 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source3/config.pbtxt @@ -0,0 +1,23 @@ +name: "batch_input_many_source3" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_MAX_ELEMENT_COUNT_AS_SHAPE + target_name: "BATCH_AND_SIZE_INPUT" + data_type: TYPE_FP32 + source_input: ["INPUT", "INPUT"] + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source3/expected b/qa/L0_model_config/noautofill_platform/batch_input_many_source3/expected new file mode 100644 index 0000000000..cf309c2a5a --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source3/expected @@ -0,0 +1 @@ +batch input kind 'BATCH_MAX_ELEMENT_COUNT_AS_SHAPE' expects 1 source input, got 2 diff --git a/qa/L0_model_config/noautofill_platform/batch_input_many_source3/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_many_source3/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_many_source3/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and 
TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/config.pbtxt new file mode 100644 index 0000000000..dc1cd96637 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/config.pbtxt @@ -0,0 +1,23 @@ +name: "batch_input_unknown_source" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +batch_input [ + { + kind: BATCH_ELEMENT_COUNT + target_name: "BATCH_INPUT" + data_type: TYPE_FP32 + source_input: "UNKNOWN_INPUT" + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/expected b/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/expected new file mode 100644 index 0000000000..56262ba55f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/expected @@ -0,0 +1 @@ +unknown source input name 'UNKNOWN_INPUT' diff --git a/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_input_unknown_source/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/config.pbtxt new file mode 100644 index 0000000000..1024f717b7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_output_duplicated_target" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +batch_output [ + { + target_name: ["OUTPUT", "OUTPUT"] + kind: BATCH_SCATTER_WITH_INPUT_SHAPE + source_input: "INPUT" + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/expected b/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/expected new file mode 100644 index 0000000000..b4fa728bb9 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/expected @@ -0,0 +1 @@ +target output name 'OUTPUT' can only be specified once diff --git a/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_duplicated_target/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_output_less_source/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_output_less_source/config.pbtxt new file mode 100644 index 0000000000..8d9fd6c0e0 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_less_source/config.pbtxt @@ -0,0 +1,21 @@ +name: "batch_output_less_source" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: 
"OUTPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +batch_output [ + { + target_name: "OUTPUT" + kind: BATCH_SCATTER_WITH_INPUT_SHAPE + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_output_less_source/expected b/qa/L0_model_config/noautofill_platform/batch_output_less_source/expected new file mode 100644 index 0000000000..c30339511c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_less_source/expected @@ -0,0 +1 @@ +batch output kind 'BATCH_SCATTER_WITH_INPUT_SHAPE' expects 1 source input, got 0 diff --git a/qa/L0_model_config/noautofill_platform/batch_output_less_source/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_output_less_source/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_less_source/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_output_many_source/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_output_many_source/config.pbtxt new file mode 100644 index 0000000000..859e17f6d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_many_source/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_output_many_source" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +batch_output [ + { + target_name: "OUTPUT" + kind: BATCH_SCATTER_WITH_INPUT_SHAPE + source_input: ["INPUT", "INPUT"] + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_output_many_source/expected b/qa/L0_model_config/noautofill_platform/batch_output_many_source/expected new file mode 100644 index 0000000000..fae05908e7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_many_source/expected @@ -0,0 +1 @@ +batch output kind 'BATCH_SCATTER_WITH_INPUT_SHAPE' expects 1 source input, got 2 diff --git a/qa/L0_model_config/noautofill_platform/batch_output_many_source/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_output_many_source/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_many_source/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/config.pbtxt new file mode 100644 index 0000000000..2e686f329d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_output_unknown_source" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +batch_output [ + { + target_name: "OUTPUT" + kind: BATCH_SCATTER_WITH_INPUT_SHAPE + source_input: "UNKNOWN_INPUT" + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/expected b/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/expected new file mode 100644 index 0000000000..56262ba55f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/expected @@ -0,0 +1 @@ 
+unknown source input name 'UNKNOWN_INPUT' diff --git a/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_unknown_source/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/config.pbtxt b/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/config.pbtxt new file mode 100644 index 0000000000..28dff8242d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/config.pbtxt @@ -0,0 +1,22 @@ +name: "batch_output_unknown_target" +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +batch_output [ + { + target_name: "UNKNOWN_OUTPUT" + kind: BATCH_SCATTER_WITH_INPUT_SHAPE + source_input: "INPUT" + } +] \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/expected b/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/expected new file mode 100644 index 0000000000..61f1d035e7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/expected @@ -0,0 +1 @@ +unknown target output name 'UNKNOWN_OUTPUT' diff --git a/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/expected_unsupported b/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/expected_unsupported new file mode 100644 index 0000000000..91ffcb84d7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/batch_output_unknown_target/expected_unsupported @@ -0,0 +1 @@ +batch inputs and batch outputs are only supported for custom platform and TensorRT platform diff --git a/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/config.pbtxt b/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/config.pbtxt new file mode 100644 index 0000000000..1fecf68202 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/config.pbtxt @@ -0,0 +1,57 @@ +name: "control_kind_end_multiple" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END0" + control [ + { + kind: CONTROL_SEQUENCE_END + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "END1" + control [ + { + kind: CONTROL_SEQUENCE_END + int32_false_true: [ 0, 1 ] + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/expected b/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/expected new file mode 100644 index 0000000000..403e602278 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/expected @@ -0,0 +1 @@ +sequence batching specifies multiple CONTROL_SEQUENCE_END tensors for control_kind_end_multiple \ No newline at end of file diff --git 
a/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/expected_ensemble b/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/expected_ensemble new file mode 100644 index 0000000000..8afe2b20b8 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_end_multiple/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble control_kind_end_multiple whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/config.pbtxt b/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/config.pbtxt new file mode 100644 index 0000000000..82f35e2aa0 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/config.pbtxt @@ -0,0 +1,48 @@ +name: "control_kind_ready_multiple" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY0" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY1" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/expected b/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/expected new file mode 100644 index 0000000000..eb8ed9a38d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/expected @@ -0,0 +1 @@ +sequence batching specifies multiple CONTROL_SEQUENCE_READY tensors for control_kind_ready_multiple \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/expected_ensemble b/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/expected_ensemble new file mode 100644 index 0000000000..b1fcdb1756 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_ready_multiple/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble control_kind_ready_multiple whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/config.pbtxt b/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/config.pbtxt new file mode 100644 index 0000000000..83ae70256e --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/config.pbtxt @@ -0,0 +1,39 @@ +name: "control_kind_start_multiple" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/expected b/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/expected new file mode 100644 index 0000000000..f55bb96195 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/expected @@ -0,0 +1 @@ +sequence batching 
specifies multiple CONTROL_SEQUENCE_START tensors for control_kind_start_multiple \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/expected_ensemble b/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/expected_ensemble new file mode 100644 index 0000000000..160ba8e1f1 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_kind_start_multiple/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble control_kind_start_multiple whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_tensor_multiple/config.pbtxt b/qa/L0_model_config/noautofill_platform/control_tensor_multiple/config.pbtxt new file mode 100644 index 0000000000..a9bcf6680f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_tensor_multiple/config.pbtxt @@ -0,0 +1,39 @@ +name: "control_tensor_multiple" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/control_tensor_multiple/expected b/qa/L0_model_config/noautofill_platform/control_tensor_multiple/expected new file mode 100644 index 0000000000..d62e3221ba --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_tensor_multiple/expected @@ -0,0 +1 @@ +sequence batching control tensor 'START' is specified for multiple control kinds for control_tensor_multiple \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_tensor_multiple/expected_ensemble b/qa/L0_model_config/noautofill_platform/control_tensor_multiple/expected_ensemble new file mode 100644 index 0000000000..f58f4da59f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_tensor_multiple/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble control_tensor_multiple whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_tensor_no_value/config.pbtxt b/qa/L0_model_config/noautofill_platform/control_tensor_no_value/config.pbtxt new file mode 100644 index 0000000000..a4763b6f59 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_tensor_no_value/config.pbtxt @@ -0,0 +1,38 @@ +name: "control_tensor_no_value" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +sequence_batching { + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + } + ] + } + ] +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/control_tensor_no_value/expected b/qa/L0_model_config/noautofill_platform/control_tensor_no_value/expected new file mode 100644 index 0000000000..1d14957565 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_tensor_no_value/expected @@ -0,0 +1 @@ +sequence batching must specify either 'int32_false_true', 'fp32_false_true' or 
'bool_false_true' for CONTROL_SEQUENCE_READY for control_tensor_no_value \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/control_tensor_no_value/expected_ensemble b/qa/L0_model_config/noautofill_platform/control_tensor_no_value/expected_ensemble new file mode 100644 index 0000000000..ae9a8db64b --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/control_tensor_no_value/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble control_tensor_no_value whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/default_priority_level0/config.pbtxt b/qa/L0_model_config/noautofill_platform/default_priority_level0/config.pbtxt new file mode 100644 index 0000000000..7c29e74637 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/default_priority_level0/config.pbtxt @@ -0,0 +1,20 @@ +name: "default_priority_level0" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + priority_levels: 3 +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/default_priority_level0/expected b/qa/L0_model_config/noautofill_platform/default_priority_level0/expected new file mode 100644 index 0000000000..c30877f08b --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/default_priority_level0/expected @@ -0,0 +1 @@ +default priority level must be in range \[1, 3\] for default_priority_level0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/default_priority_level0/expected_ensemble b/qa/L0_model_config/noautofill_platform/default_priority_level0/expected_ensemble new file mode 100644 index 0000000000..95f1e1950e --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/default_priority_level0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble default_priority_level0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/default_priority_level1/config.pbtxt b/qa/L0_model_config/noautofill_platform/default_priority_level1/config.pbtxt new file mode 100644 index 0000000000..87cffd5c84 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/default_priority_level1/config.pbtxt @@ -0,0 +1,21 @@ +name: "default_priority_level1" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + priority_levels: 3 + default_priority_level: 5 +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/default_priority_level1/expected b/qa/L0_model_config/noautofill_platform/default_priority_level1/expected new file mode 100644 index 0000000000..b1ff9725f4 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/default_priority_level1/expected @@ -0,0 +1 @@ +default priority level must be in range \[1, 3\] for default_priority_level1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/default_priority_level1/expected_ensemble b/qa/L0_model_config/noautofill_platform/default_priority_level1/expected_ensemble new file mode 100644 index 0000000000..7d661dbad5 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/default_priority_level1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble default_priority_level1 whose platform 
is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/ensemble_scheduling_set/config.pbtxt b/qa/L0_model_config/noautofill_platform/ensemble_scheduling_set/config.pbtxt new file mode 100644 index 0000000000..21e7272482 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/ensemble_scheduling_set/config.pbtxt @@ -0,0 +1,39 @@ +name: "ensemble_scheduling_set" +max_batch_size: 8 +ensemble_scheduling { + step [ + { + model_name: "model_a" + model_version: -1 + input_map { + key: "model_a_input" + value: "data" + } + output_map { + key: "model_a_output" + value: "prob" + } + } + ] +} +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] +instance_group [ + { + kind: KIND_GPU + gpus: [ 42 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/ensemble_scheduling_set/expected b/qa/L0_model_config/noautofill_platform/ensemble_scheduling_set/expected new file mode 100644 index 0000000000..615b25f02d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/ensemble_scheduling_set/expected @@ -0,0 +1 @@ +ensemble scheduling cannot be set for model 'ensemble_scheduling_set' whose platform is not ensemble \ No newline at end of file diff --git a/src/test/testdata/model_config_sanity/invalid_cpu/config.pbtxt b/qa/L0_model_config/noautofill_platform/invalid_cpu/config.pbtxt similarity index 100% rename from src/test/testdata/model_config_sanity/invalid_cpu/config.pbtxt rename to qa/L0_model_config/noautofill_platform/invalid_cpu/config.pbtxt diff --git a/qa/L0_model_config/noautofill_platform/invalid_cpu/expected b/qa/L0_model_config/noautofill_platform/invalid_cpu/expected new file mode 100644 index 0000000000..6a9d6b6c5a --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/invalid_cpu/expected @@ -0,0 +1 @@ +instance group invalid_cpu_0 of model invalid_cpu has kind KIND_CPU but specifies one or more GPU \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/invalid_cpu/expected_ensemble b/qa/L0_model_config/noautofill_platform/invalid_cpu/expected_ensemble new file mode 100644 index 0000000000..efb14622f6 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/invalid_cpu/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble invalid_cpu whose platform is ensemble \ No newline at end of file diff --git a/src/test/testdata/model_config_sanity/invalid_gpu/config.pbtxt b/qa/L0_model_config/noautofill_platform/invalid_gpu/config.pbtxt similarity index 100% rename from src/test/testdata/model_config_sanity/invalid_gpu/config.pbtxt rename to qa/L0_model_config/noautofill_platform/invalid_gpu/config.pbtxt diff --git a/qa/L0_model_config/noautofill_platform/invalid_gpu/expected b/qa/L0_model_config/noautofill_platform/invalid_gpu/expected new file mode 100644 index 0000000000..f679d70920 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/invalid_gpu/expected @@ -0,0 +1 @@ +instance group invalid_gpu_0 of model invalid_gpu specifies invalid or unsupported gpu id 42. 
GPUs with at least the minimum required CUDA compute compatibility \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/invalid_gpu/expected_ensemble b/qa/L0_model_config/noautofill_platform/invalid_gpu/expected_ensemble new file mode 100644 index 0000000000..ff29961e3c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/invalid_gpu/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble invalid_gpu whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/missing_datatype/config.pbtxt b/qa/L0_model_config/noautofill_platform/missing_datatype/config.pbtxt new file mode 100644 index 0000000000..4a348dcab2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/missing_datatype/config.pbtxt @@ -0,0 +1,15 @@ +name: "missing_datatype" +max_batch_size: 4 +input [ + { + name: "input" + dims: [ 2, -1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/missing_datatype/expected b/qa/L0_model_config/noautofill_platform/missing_datatype/expected new file mode 100644 index 0000000000..8a8aefd98a --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/missing_datatype/expected @@ -0,0 +1 @@ +model input 'input' must specify 'data_type' \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/missing_datatype/expected_ensemble b/qa/L0_model_config/noautofill_platform/missing_datatype/expected_ensemble new file mode 100644 index 0000000000..1427e7d73b --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/missing_datatype/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble missing_datatype whose platform is ensemble \ No newline at end of file diff --git a/src/test/testdata/model_config_sanity/negative_gpu/config.pbtxt b/qa/L0_model_config/noautofill_platform/negative_gpu/config.pbtxt similarity index 100% rename from src/test/testdata/model_config_sanity/negative_gpu/config.pbtxt rename to qa/L0_model_config/noautofill_platform/negative_gpu/config.pbtxt diff --git a/qa/L0_model_config/noautofill_platform/negative_gpu/expected b/qa/L0_model_config/noautofill_platform/negative_gpu/expected new file mode 100644 index 0000000000..ad134739c8 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/negative_gpu/expected @@ -0,0 +1 @@ +instance group negative_gpu_1 of model negative_gpu specifies invalid or unsupported gpu id -1. 
GPUs with at least the minimum required CUDA compute compatibility \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/negative_gpu/expected_ensemble b/qa/L0_model_config/noautofill_platform/negative_gpu/expected_ensemble new file mode 100644 index 0000000000..ba0aa8770f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/negative_gpu/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble negative_gpu whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/negative_max_batch_size/config.pbtxt b/qa/L0_model_config/noautofill_platform/negative_max_batch_size/config.pbtxt new file mode 100644 index 0000000000..11baf44450 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/negative_max_batch_size/config.pbtxt @@ -0,0 +1,17 @@ +name: "negative_max_batch_size" +max_batch_size: -2 +input [ + { + name: "data" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 1, 28, 28 ] + } +] +output [ + { + name: "prob" + data_type: TYPE_FP32 + dims: [ 10, 1, 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/negative_max_batch_size/expected b/qa/L0_model_config/noautofill_platform/negative_max_batch_size/expected new file mode 100644 index 0000000000..4d8ef43082 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/negative_max_batch_size/expected @@ -0,0 +1 @@ +'max_batch_size' must be non-negative value for negative_max_batch_size \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/negative_max_batch_size/expected_ensemble b/qa/L0_model_config/noautofill_platform/negative_max_batch_size/expected_ensemble new file mode 100644 index 0000000000..41bc3f0c25 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/negative_max_batch_size/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble negative_max_batch_size whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering0/config.pbtxt b/qa/L0_model_config/noautofill_platform/preserve_ordering0/config.pbtxt new file mode 100644 index 0000000000..3d7c0a6fd9 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering0/config.pbtxt @@ -0,0 +1,27 @@ +name: "preserve_ordering0" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + preserve_ordering: true + priority_levels: 3 + default_priority_level: 2 + priority_queue_policy { + key: 1 + value: { + } + } +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering0/expected b/qa/L0_model_config/noautofill_platform/preserve_ordering0/expected new file mode 100644 index 0000000000..4b1638966f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering0/expected @@ -0,0 +1 @@ +Only one priority level is allowed when 'preserve_ordering' is true for preserve_ordering0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering0/expected_ensemble b/qa/L0_model_config/noautofill_platform/preserve_ordering0/expected_ensemble new file mode 100644 index 0000000000..d1e724ef2d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble preserve_ordering0 whose platform is ensemble \ No newline at end of file diff --git 
a/qa/L0_model_config/noautofill_platform/preserve_ordering1/config.pbtxt b/qa/L0_model_config/noautofill_platform/preserve_ordering1/config.pbtxt new file mode 100644 index 0000000000..82a8948817 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering1/config.pbtxt @@ -0,0 +1,24 @@ +name: "preserve_ordering1" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + preserve_ordering: true + default_queue_policy { + timeout_action: DELAY + default_timeout_microseconds: 1000 + } +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering1/expected b/qa/L0_model_config/noautofill_platform/preserve_ordering1/expected new file mode 100644 index 0000000000..f9b014ad85 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering1/expected @@ -0,0 +1 @@ +Queue policy can not have DELAY as timeout action when 'preserve_ordering' is true for preserve_ordering1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering1/expected_ensemble b/qa/L0_model_config/noautofill_platform/preserve_ordering1/expected_ensemble new file mode 100644 index 0000000000..695852a01f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble preserve_ordering1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering2/config.pbtxt b/qa/L0_model_config/noautofill_platform/preserve_ordering2/config.pbtxt new file mode 100644 index 0000000000..35ff408967 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering2/config.pbtxt @@ -0,0 +1,29 @@ +name: "preserve_ordering2" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + preserve_ordering: true + priority_levels: 1 + default_priority_level: 1 + priority_queue_policy { + key: 1 + value: { + timeout_action: DELAY + default_timeout_microseconds: 1000 + } + } +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering2/expected b/qa/L0_model_config/noautofill_platform/preserve_ordering2/expected new file mode 100644 index 0000000000..5db30b45fd --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering2/expected @@ -0,0 +1 @@ +Queue policy can not have DELAY as timeout action when 'preserve_ordering' is true for preserve_ordering2 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/preserve_ordering2/expected_ensemble b/qa/L0_model_config/noautofill_platform/preserve_ordering2/expected_ensemble new file mode 100644 index 0000000000..8c0ec8a2c2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/preserve_ordering2/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble preserve_ordering2 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/priority_level0/config.pbtxt b/qa/L0_model_config/noautofill_platform/priority_level0/config.pbtxt new file mode 100644 index 0000000000..7167b043d2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/priority_level0/config.pbtxt @@ -0,0 +1,26 @@ +name: "priority_level0" +platform: 
"tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + priority_levels: 3 + default_priority_level: 2 + priority_queue_policy { + key: 0 + value: { + } + } +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/priority_level0/expected b/qa/L0_model_config/noautofill_platform/priority_level0/expected new file mode 100644 index 0000000000..ae22d2f5b2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/priority_level0/expected @@ -0,0 +1 @@ +priority queue policy must have priority level in range \[1, 3\] for priority_level0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/priority_level0/expected_ensemble b/qa/L0_model_config/noautofill_platform/priority_level0/expected_ensemble new file mode 100644 index 0000000000..8fce1c7a95 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/priority_level0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble priority_level0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/priority_level1/config.pbtxt b/qa/L0_model_config/noautofill_platform/priority_level1/config.pbtxt new file mode 100644 index 0000000000..aa274bd9b3 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/priority_level1/config.pbtxt @@ -0,0 +1,26 @@ +name: "priority_level1" +platform: "tensorflow_savedmodel" +max_batch_size: 8 +dynamic_batching { + priority_levels: 3 + default_priority_level: 2 + priority_queue_policy { + key: 4 + value: { + } + } +} +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/priority_level1/expected b/qa/L0_model_config/noautofill_platform/priority_level1/expected new file mode 100644 index 0000000000..21b8ae0c66 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/priority_level1/expected @@ -0,0 +1 @@ +priority queue policy must have priority level in range \[1, 3\] for priority_level1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/priority_level1/expected_ensemble b/qa/L0_model_config/noautofill_platform/priority_level1/expected_ensemble new file mode 100644 index 0000000000..ee97f91f68 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/priority_level1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble priority_level1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount0/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_elementcount0/config.pbtxt new file mode 100644 index 0000000000..c89d9b7f54 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount0/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_elementcount0" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1, 3, 2 ] + reshape { shape: [ 5 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount0/expected b/qa/L0_model_config/noautofill_platform/reshape_elementcount0/expected new file mode 100644 index 0000000000..b64dd448fd --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount0/expected @@ -0,0 +1 @@ +model input 'input' has 
different size for dims and reshape for reshape_elementcount0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount0/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_elementcount0/expected_ensemble new file mode 100644 index 0000000000..a615097e86 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_elementcount0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount1/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_elementcount1/config.pbtxt new file mode 100644 index 0000000000..90a32cb647 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount1/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_elementcount1" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 15 ] + reshape { shape: [ 2, 1, 5 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount1/expected b/qa/L0_model_config/noautofill_platform/reshape_elementcount1/expected new file mode 100644 index 0000000000..042d5381d2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount1/expected @@ -0,0 +1 @@ +model input 'input' has different size for dims and reshape for reshape_elementcount1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount1/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_elementcount1/expected_ensemble new file mode 100644 index 0000000000..4a89e78a5a --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_elementcount1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount2/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_elementcount2/config.pbtxt new file mode 100644 index 0000000000..5268958e58 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount2/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_elementcount2" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1, 3, 2 ] + reshape { shape: [ 3 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount2/expected b/qa/L0_model_config/noautofill_platform/reshape_elementcount2/expected new file mode 100644 index 0000000000..e4ea6faf79 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount2/expected @@ -0,0 +1 @@ +model output 'output' has different size for dims and reshape for reshape_elementcount2 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount2/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_elementcount2/expected_ensemble new file mode 100644 index 0000000000..9bba6b5fdd --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount2/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_elementcount2 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount3/config.pbtxt 
b/qa/L0_model_config/noautofill_platform/reshape_elementcount3/config.pbtxt new file mode 100644 index 0000000000..587a96b8dc --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount3/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_elementcount3" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 15 ] + reshape { shape: [ 3, 2, 5 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount3/expected b/qa/L0_model_config/noautofill_platform/reshape_elementcount3/expected new file mode 100644 index 0000000000..c02af51e00 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount3/expected @@ -0,0 +1 @@ +model output 'output' has different size for dims and reshape for reshape_elementcount3 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_elementcount3/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_elementcount3/expected_ensemble new file mode 100644 index 0000000000..82dac72f9f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_elementcount3/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_elementcount3 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/config.pbtxt new file mode 100644 index 0000000000..353eb7d4ca --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_empty0" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape { shape: [ ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/expected new file mode 100644 index 0000000000..84371a2d96 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/expected @@ -0,0 +1 @@ +model input 'input' cannot have empty reshape for non-batching model as scalar tensors are not supported for reshape_nobatch_empty0 diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/expected_ensemble new file mode 100644 index 0000000000..bd9a79b78c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_empty0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/config.pbtxt new file mode 100644 index 0000000000..e75e5c0d08 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_empty1" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape { shape: [ ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/expected new file mode 100644 index 
0000000000..e904ce6e2e --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/expected @@ -0,0 +1 @@ +model output 'output' cannot have empty reshape for non-batching model as scalar tensors are not supported for reshape_nobatch_empty1 diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/expected_ensemble new file mode 100644 index 0000000000..57f7cd9477 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_empty1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_empty1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/config.pbtxt new file mode 100644 index 0000000000..05e58808ce --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_variable0" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, 2 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/expected new file mode 100644 index 0000000000..f876c0affd --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/expected @@ -0,0 +1 @@ +model input 'input' has different size for dims and reshape for reshape_nobatch_variable0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/expected_ensemble new file mode 100644 index 0000000000..83fad14994 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_variable0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/config.pbtxt new file mode 100644 index 0000000000..0a821e956c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_variable1" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, 2 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/expected new file mode 100644 index 0000000000..6a31d30e6c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/expected @@ -0,0 +1 @@ +model output 'output' has different size for dims and reshape for reshape_nobatch_variable1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/expected_ensemble new file mode 100644 index 0000000000..e4b07427c3 --- /dev/null +++ 
b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_variable1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/config.pbtxt new file mode 100644 index 0000000000..7fdf90005c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_variable2" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, -1, 2 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/expected new file mode 100644 index 0000000000..e5b2b04cbe --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/expected @@ -0,0 +1 @@ +model input 'input' has different size for dims and reshape for reshape_nobatch_variable2 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/expected_ensemble new file mode 100644 index 0000000000..7dae5f5eb2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable2/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_variable2 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/config.pbtxt new file mode 100644 index 0000000000..052262b68d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_variable3" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 1, -1, 2 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/expected new file mode 100644 index 0000000000..0aae4b7f6d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/expected @@ -0,0 +1 @@ +model output 'output' has different size for dims and reshape for reshape_nobatch_variable3 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/expected_ensemble new file mode 100644 index 0000000000..bd5536b49e --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable3/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_variable3 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/config.pbtxt new file mode 100644 index 0000000000..206567621c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/config.pbtxt @@ -0,0 +1,17 @@ +name: 
"reshape_nobatch_variable4" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, -1, -1 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/expected new file mode 100644 index 0000000000..91444c4024 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/expected @@ -0,0 +1 @@ +model input 'input' has different number of variable-size dimensions for dims and reshape for reshape_nobatch_variable4 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/expected_ensemble new file mode 100644 index 0000000000..9c768459fd --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable4/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_variable4 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/config.pbtxt new file mode 100644 index 0000000000..f7eb036612 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_variable5" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, -1, -1 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/expected new file mode 100644 index 0000000000..43d172d57c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/expected @@ -0,0 +1 @@ +model output 'output' has different number of variable-size dimensions for dims and reshape for reshape_nobatch_variable5 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/expected_ensemble new file mode 100644 index 0000000000..fcc60c4ec7 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_variable5/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_variable5 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/config.pbtxt new file mode 100644 index 0000000000..5a574ed09f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_zerodims0" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, 2 ] + reshape { shape: [ 0 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/expected new file mode 100644 index 0000000000..d22ceeb44d --- /dev/null +++ 
b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/expected @@ -0,0 +1 @@ +model input 'input' reshape dimensions must be integer >= 1, or -1 to indicate a variable-size dimension for reshape_nobatch_zerodims0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/expected_ensemble new file mode 100644 index 0000000000..1605054be0 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_zerodims0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/config.pbtxt new file mode 100644 index 0000000000..0e701ac6db --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_nobatch_zerodims1" +max_batch_size: 0 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, 2 ] + reshape { shape: [ 0 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/expected b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/expected new file mode 100644 index 0000000000..63910d9cb8 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/expected @@ -0,0 +1 @@ +model output 'output' reshape dimensions must be integer >= 1, or -1 to indicate a variable-size dimension for reshape_nobatch_zerodims1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/expected_ensemble new file mode 100644 index 0000000000..7859d26451 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_nobatch_zerodims1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_nobatch_zerodims1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable0/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_variable0/config.pbtxt new file mode 100644 index 0000000000..1e34af076b --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable0/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_variable0" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, 2 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable0/expected b/qa/L0_model_config/noautofill_platform/reshape_variable0/expected new file mode 100644 index 0000000000..05b8211905 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable0/expected @@ -0,0 +1 @@ +model input 'input' has different size for dims and reshape for reshape_variable0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable0/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_variable0/expected_ensemble new file mode 100644 index 0000000000..6c9a6a26ce --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling 
must be set for ensemble reshape_variable0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable1/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_variable1/config.pbtxt new file mode 100644 index 0000000000..7c6b9d7002 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable1/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_variable1" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, 2 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable1/expected b/qa/L0_model_config/noautofill_platform/reshape_variable1/expected new file mode 100644 index 0000000000..0c875bd012 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable1/expected @@ -0,0 +1 @@ +model output 'output' has different size for dims and reshape for reshape_variable1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable1/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_variable1/expected_ensemble new file mode 100644 index 0000000000..ba3d55612f --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_variable1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable2/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_variable2/config.pbtxt new file mode 100644 index 0000000000..0d89113b1a --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable2/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_variable2" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, -1, 2 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable2/expected b/qa/L0_model_config/noautofill_platform/reshape_variable2/expected new file mode 100644 index 0000000000..b62b5a92a5 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable2/expected @@ -0,0 +1 @@ +model input 'input' has different size for dims and reshape for reshape_variable2 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable2/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_variable2/expected_ensemble new file mode 100644 index 0000000000..d81ab1a703 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable2/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_variable2 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable3/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_variable3/config.pbtxt new file mode 100644 index 0000000000..48b28aab70 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable3/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_variable3" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 1, -1, 2 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable3/expected 
b/qa/L0_model_config/noautofill_platform/reshape_variable3/expected new file mode 100644 index 0000000000..c012504872 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable3/expected @@ -0,0 +1 @@ +model output 'output' has different size for dims and reshape for reshape_variable3 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable3/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_variable3/expected_ensemble new file mode 100644 index 0000000000..7f7c79c695 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable3/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_variable3 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable4/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_variable4/config.pbtxt new file mode 100644 index 0000000000..71f2d71221 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable4/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_variable4" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, -1, -1 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable4/expected b/qa/L0_model_config/noautofill_platform/reshape_variable4/expected new file mode 100644 index 0000000000..d5bde03ccc --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable4/expected @@ -0,0 +1 @@ +model input 'input' has different number of variable-size dimensions for dims and reshape for reshape_variable4 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable4/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_variable4/expected_ensemble new file mode 100644 index 0000000000..5bffc461a6 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable4/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_variable4 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable5/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_variable5/config.pbtxt new file mode 100644 index 0000000000..cf4dbe3b5e --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable5/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_variable5" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, -1 ] + reshape { shape: [ 2, -1, -1 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable5/expected b/qa/L0_model_config/noautofill_platform/reshape_variable5/expected new file mode 100644 index 0000000000..4af30e17df --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable5/expected @@ -0,0 +1 @@ +model output 'output' has different number of variable-size dimensions for dims and reshape for reshape_variable5 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_variable5/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_variable5/expected_ensemble new file mode 100644 index 0000000000..75b2e73278 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_variable5/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble 
reshape_variable5 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_zerodims0/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_zerodims0/config.pbtxt new file mode 100644 index 0000000000..49365e0dd9 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_zerodims0/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_zerodims0" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 2, 2 ] + reshape { shape: [ 0 ] } + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_zerodims0/expected b/qa/L0_model_config/noautofill_platform/reshape_zerodims0/expected new file mode 100644 index 0000000000..8d178e917c --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_zerodims0/expected @@ -0,0 +1 @@ +model input 'input' reshape dimensions must be integer >= 1, or -1 to indicate a variable-size dimension for reshape_zerodims0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_zerodims0/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_zerodims0/expected_ensemble new file mode 100644 index 0000000000..fb2078fb98 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_zerodims0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_zerodims0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_zerodims1/config.pbtxt b/qa/L0_model_config/noautofill_platform/reshape_zerodims1/config.pbtxt new file mode 100644 index 0000000000..f5b6078810 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_zerodims1/config.pbtxt @@ -0,0 +1,17 @@ +name: "reshape_zerodims1" +max_batch_size: 4 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 2, 2 ] + reshape { shape: [ 0 ] } + } +] diff --git a/qa/L0_model_config/noautofill_platform/reshape_zerodims1/expected b/qa/L0_model_config/noautofill_platform/reshape_zerodims1/expected new file mode 100644 index 0000000000..eed06a761b --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_zerodims1/expected @@ -0,0 +1 @@ +model output 'output' reshape dimensions must be integer >= 1, or -1 to indicate a variable-size dimension for reshape_zerodims1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/reshape_zerodims1/expected_ensemble b/qa/L0_model_config/noautofill_platform/reshape_zerodims1/expected_ensemble new file mode 100644 index 0000000000..991bdc9f63 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/reshape_zerodims1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble reshape_zerodims1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_input0/config.pbtxt b/qa/L0_model_config/noautofill_platform/zerodims_input0/config.pbtxt new file mode 100644 index 0000000000..e6e392d661 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_input0/config.pbtxt @@ -0,0 +1,16 @@ +name: "zerodims_input0" +max_batch_size: 8 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1, 0, 28 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/zerodims_input0/expected 
b/qa/L0_model_config/noautofill_platform/zerodims_input0/expected new file mode 100644 index 0000000000..174ee03eb3 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_input0/expected @@ -0,0 +1 @@ +model input 'input' dimension must be integer >= 1, or -1 to indicate a variable-size dimension for zerodims_input0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_input0/expected_ensemble b/qa/L0_model_config/noautofill_platform/zerodims_input0/expected_ensemble new file mode 100644 index 0000000000..951c9f0f97 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_input0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble zerodims_input0 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_input1/config.pbtxt b/qa/L0_model_config/noautofill_platform/zerodims_input1/config.pbtxt new file mode 100644 index 0000000000..bab31346ad --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_input1/config.pbtxt @@ -0,0 +1,16 @@ +name: "zerodims_input1" +max_batch_size: 8 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 0 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/zerodims_input1/expected b/qa/L0_model_config/noautofill_platform/zerodims_input1/expected new file mode 100644 index 0000000000..d2d38d65f2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_input1/expected @@ -0,0 +1 @@ +model input 'input' dimension must be integer >= 1, or -1 to indicate a variable-size dimension for zerodims_input1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_input1/expected_ensemble b/qa/L0_model_config/noautofill_platform/zerodims_input1/expected_ensemble new file mode 100644 index 0000000000..0e80f84ef0 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_input1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble zerodims_input1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_output0/config.pbtxt b/qa/L0_model_config/noautofill_platform/zerodims_output0/config.pbtxt new file mode 100644 index 0000000000..f7d0a3cc7d --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_output0/config.pbtxt @@ -0,0 +1,16 @@ +name: "zerodims_output0" +max_batch_size: 8 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1, 28 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 1, 1, 0 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/zerodims_output0/expected b/qa/L0_model_config/noautofill_platform/zerodims_output0/expected new file mode 100644 index 0000000000..1827f6ec41 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_output0/expected @@ -0,0 +1 @@ +model output 'output' dimension must be integer >= 1, or -1 to indicate a variable-size dimension for zerodims_output0 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_output0/expected_ensemble b/qa/L0_model_config/noautofill_platform/zerodims_output0/expected_ensemble new file mode 100644 index 0000000000..d2c5d0c5f2 --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_output0/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble zerodims_output0 whose platform is ensemble \ No newline at end of 
file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_output1/config.pbtxt b/qa/L0_model_config/noautofill_platform/zerodims_output1/config.pbtxt new file mode 100644 index 0000000000..5346e0794e --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_output1/config.pbtxt @@ -0,0 +1,16 @@ +name: "zerodims_output1" +max_batch_size: 8 +input [ + { + name: "input" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "output" + data_type: TYPE_FP32 + dims: [ 0 ] + } +] diff --git a/qa/L0_model_config/noautofill_platform/zerodims_output1/expected b/qa/L0_model_config/noautofill_platform/zerodims_output1/expected new file mode 100644 index 0000000000..9c4836f3ee --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_output1/expected @@ -0,0 +1 @@ +model output 'output' dimension must be integer >= 1, or -1 to indicate a variable-size dimension for zerodims_output1 \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_platform/zerodims_output1/expected_ensemble b/qa/L0_model_config/noautofill_platform/zerodims_output1/expected_ensemble new file mode 100644 index 0000000000..92554ebcbd --- /dev/null +++ b/qa/L0_model_config/noautofill_platform/zerodims_output1/expected_ensemble @@ -0,0 +1 @@ +ensemble scheduling must be set for ensemble zerodims_output1 whose platform is ensemble \ No newline at end of file diff --git a/qa/L0_model_config/noautofill_test.py b/qa/L0_model_config/noautofill_test.py new file mode 100755 index 0000000000..d89e306eb8 --- /dev/null +++ b/qa/L0_model_config/noautofill_test.py @@ -0,0 +1,62 @@ +#!/usr/bin/python +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import unittest + +import test_util as tu +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + + +class NoAutoFillTest(tu.TestResultCollector): + def setUp(self): + self._model_name = "noautofill_noconfig" + self._triton_client = httpclient.InferenceServerClient("localhost:8000") + + def tearDown(self): + self._triton_client.unload_model(self._model_name) + + def test_load_no_autofill_model_with_config(self): + config = '{"max_batch_size":"16"}' + self._triton_client.load_model(self._model_name, config=config) + + # Check if the model config is correct + model_config = self._triton_client.get_model_config(self._model_name) + self.assertEqual(model_config["max_batch_size"], 16) + + def test_load_no_autofill_model_with_no_config(self): + with self.assertRaises(InferenceServerException) as ex: + self._triton_client.load_model(self._model_name) + self.assertIn("model configuration is not provided", str(ex.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_model_config/special_cases/invalid_platform/config.pbtxt b/qa/L0_model_config/special_cases/invalid_platform/config.pbtxt new file mode 100644 index 0000000000..6cdb34f1c0 --- /dev/null +++ b/qa/L0_model_config/special_cases/invalid_platform/config.pbtxt @@ -0,0 +1,29 @@ +name: "invalid_platform" +platform: "tensorflo" +default_model_filename: "model.savedmodel" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + label_filename: "output0_labels.txt" + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] diff --git a/qa/L0_model_config/special_cases/invalid_platform/expected b/qa/L0_model_config/special_cases/invalid_platform/expected new file mode 100644 index 0000000000..4be7d00ff0 --- /dev/null +++ b/qa/L0_model_config/special_cases/invalid_platform/expected @@ -0,0 +1 @@ +unexpected platform type 'tensorflo' for invalid_platform diff --git a/qa/L0_model_config/special_cases/invalid_runtime/config.pbtxt b/qa/L0_model_config/special_cases/invalid_runtime/config.pbtxt new file mode 100644 index 0000000000..492ff9094f --- /dev/null +++ b/qa/L0_model_config/special_cases/invalid_runtime/config.pbtxt @@ -0,0 +1,23 @@ +name: "invalid_runtime" +max_batch_size: 2 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] +runtime: "__invalid_runtime__" diff --git a/qa/L0_model_config/special_cases/invalid_runtime/expected b/qa/L0_model_config/special_cases/invalid_runtime/expected new file mode 100644 index 0000000000..2570c6f2d1 --- /dev/null +++ b/qa/L0_model_config/special_cases/invalid_runtime/expected @@ -0,0 +1 @@ +unable to find backend library '__invalid_runtime__' for model 'invalid_runtime' diff --git a/qa/L0_model_config/special_cases/noautofill_noconfig/expected b/qa/L0_model_config/special_cases/noautofill_noconfig/expected new file mode 100644 index 0000000000..5a0abf84dc --- /dev/null +++ b/qa/L0_model_config/special_cases/noautofill_noconfig/expected @@ -0,0 +1 @@ +model configuration is not provided diff --git a/qa/L0_model_config/special_cases/runtime_escape/config.pbtxt b/qa/L0_model_config/special_cases/runtime_escape/config.pbtxt new file mode 100644 index 
0000000000..8365e2d14a --- /dev/null +++ b/qa/L0_model_config/special_cases/runtime_escape/config.pbtxt @@ -0,0 +1,23 @@ +name: "runtime_escape" +max_batch_size: 2 +backend: "identity" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] +runtime: "../dummy_runtime/libtriton_identity.so" diff --git a/qa/L0_model_config/special_cases/runtime_escape/expected b/qa/L0_model_config/special_cases/runtime_escape/expected new file mode 100644 index 0000000000..7c2506c692 --- /dev/null +++ b/qa/L0_model_config/special_cases/runtime_escape/expected @@ -0,0 +1 @@ +backend library name '../dummy_runtime/libtriton_identity.so' escapes backend directory diff --git a/qa/L0_model_config/test.sh b/qa/L0_model_config/test.sh new file mode 100755 index 0000000000..55133e69d9 --- /dev/null +++ b/qa/L0_model_config/test.sh @@ -0,0 +1,640 @@ +#!/bin/bash +# Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +CLIENT_LOG="./client.log" +CLIENT=model_config_test.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=20 +SERVER_LOG_BASE="./inference_server" +source ../common/util.sh + +export CUDA_VISIBLE_DEVICES=0 + +TRIALS="tensorflow_savedmodel tensorflow_graphdef tensorrt_plan onnxruntime_onnx pytorch_libtorch" + +# Copy fixed TensorRT plans into the test model repositories. 
+for modelpath in \ + autofill_noplatform/tensorrt/bad_input_dims/1 \ + autofill_noplatform/tensorrt/bad_input_shape/1 \ + autofill_noplatform/tensorrt/bad_input_type/1 \ + autofill_noplatform/tensorrt/bad_input_shape_tensor/1 \ + autofill_noplatform/tensorrt/bad_input_non_linear_format_io/1 \ + autofill_noplatform/tensorrt/bad_output_dims/1 \ + autofill_noplatform/tensorrt/bad_output_shape/1 \ + autofill_noplatform/tensorrt/bad_output_type/1 \ + autofill_noplatform/tensorrt/bad_output_shape_tensor/1 \ + autofill_noplatform/tensorrt/bad_outut_non_linear_format_io/1 \ + autofill_noplatform/tensorrt/too_few_inputs/1 \ + autofill_noplatform/tensorrt/too_many_inputs/1 \ + autofill_noplatform/tensorrt/unknown_input/1 \ + autofill_noplatform/tensorrt/unknown_output/1 \ + autofill_noplatform_success/tensorrt/no_name_platform/1 \ + autofill_noplatform_success/tensorrt/empty_config/1 \ + autofill_noplatform_success/tensorrt/no_config/1 \ + autofill_noplatform_success/tensorrt/incomplete_input/1 \ + autofill_noplatform_success/tensorrt/incomplete_output/1 ; do + mkdir -p $modelpath + cp /data/inferenceserver/${REPO_VERSION}/qa_model_repository/plan_float32_float32_float32/1/model.plan \ + $modelpath/. + + # Create a dummy file which must be ignored. This test is only needed + # for TensorRT autofiller as it is the last backend that attempts to + # load the files provided in the version directory. Essentially, + # for autofiller of other backends, a TensorRT plan would behave + # like this dummy file. + echo "dummy_content" >> $modelpath/dummy_file.txt +done + + +# Copy TensorRT plans with shape tensor into the test model repositories. +for modelpath in \ + autofill_noplatform/tensorrt/mixed_batch_hint_dims/1 \ + autofill_noplatform/tensorrt/mixed_batch_hint_shape_values/1 \ + autofill_noplatform_success/tensorrt/no_config_shape_tensor/1 ; do + mkdir -p $modelpath + cp /data/inferenceserver/${REPO_VERSION}/qa_shapetensor_model_repository/plan_zero_1_float32_int32/1/model.plan \ + $modelpath/. +done + +# Copy TensorRT plans with non-linear format IO into the test model repositories. +for modelpath in \ + autofill_noplatform_success/tensorrt/no_config_non_linear_format_io/1 ; do + mkdir -p $modelpath + cp /data/inferenceserver/${REPO_VERSION}/qa_trt_format_model_repository/plan_CHW32_LINEAR_float32_float32_float32/1/model.plan \ + $modelpath/. +done + +# Copy variable-sized TensorRT plans into the test model repositories. +for modelpath in \ + autofill_noplatform_success/tensorrt/no_name_platform_variable/1 \ + autofill_noplatform_success/tensorrt/empty_config_variable/1 \ + autofill_noplatform_success/tensorrt/no_config_variable/1 \ + autofill_noplatform_success/tensorrt/hint_for_no_batch/1 \ + autofill_noplatform_success/tensorrt/multi_prof_max_bs/1 ; do + mkdir -p $modelpath + cp /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32/1/model.plan \ + $modelpath/. +done + +for modelpath in \ + autofill_noplatform/tensorrt/bad_dynamic_shapes_max/1 \ + autofill_noplatform/tensorrt/bad_dynamic_shapes_min/1 ; do + mkdir -p $modelpath + cp /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32-4-32/1/model.plan \ + $modelpath/. 
+done + +for modelpath in \ + autofill_noplatform/ensemble/invalid_input_map/invalid_input_map/1 \ + autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_input4/1 \ + autofill_noplatform/ensemble/invalid_input_map/fp32_dim1_batch4_output3/1 \ + autofill_noplatform/ensemble/invalid_output_map/invalid_output_map/1 \ + autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_input4/1 \ + autofill_noplatform/ensemble/invalid_output_map/fp32_dim1_batch4_output3/1 \ + autofill_noplatform/ensemble/invalid_batch_size/invalid_batch_size/1 \ + autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch2/1 \ + autofill_noplatform/ensemble/invalid_batch_size/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/invalid_decoupled_branching/invalid_decoupled_branching/1 \ + autofill_noplatform/ensemble/invalid_decoupled_branching/int32_dim1_nobatch_output2/1 \ + autofill_noplatform/ensemble/invalid_decoupled_branching_2/invalid_decoupled_branching_2/1 \ + autofill_noplatform/ensemble/inconsistent_shape/inconsistent_shape/1 \ + autofill_noplatform/ensemble/inconsistent_shape/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/inconsistent_shape/fp32_dim3_batch4/1 \ + autofill_noplatform/ensemble/inconsistent_data_type/inconsistent_data_type/1 \ + autofill_noplatform/ensemble/inconsistent_data_type/fp32_dim1_batch2/1 \ + autofill_noplatform/ensemble/inconsistent_data_type/int32_dim1_batch4/1 \ + autofill_noplatform/ensemble/non_existing_model/non_existing_model/1 \ + autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/non_existing_model/fp32_dim1_batch4_output3/1 \ + autofill_noplatform/ensemble/self_circular_dependency/self_circular_dependency/1 \ + autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_input4/1 \ + autofill_noplatform/ensemble/self_circular_dependency/fp32_dim1_batch4_output3/1 \ + autofill_noplatform/ensemble/unmapped_input/unmapped_input/1 \ + autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4/1 \ + autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_input4/1 \ + autofill_noplatform/ensemble/unmapped_input/fp32_dim1_batch4_output3/1 \ + autofill_noplatform/ensemble/circular_dependency/circular_dependency/1 \ + autofill_noplatform/ensemble/circular_dependency/circular_dependency_2/1 \ + autofill_noplatform/ensemble/no_required_version/no_required_version/1 \ + autofill_noplatform/ensemble/no_required_version/simple/1 \ + autofill_noplatform/ensemble/no_required_version_2/no_required_version_2/1 \ + autofill_noplatform/ensemble/no_required_version_2/simple/1 \ + autofill_noplatform/ensemble/no_required_version_3/no_required_version_3/1 \ + autofill_noplatform/ensemble/no_required_version_3/simple/1 \ + autofill_noplatform_success/ensemble/embedded_ensemble/embedded_ensemble/1 \ + autofill_noplatform_success/ensemble/embedded_ensemble/fp32_dim1_batch4/1 \ + autofill_noplatform_success/ensemble/embedded_ensemble/inner_ensemble/1 \ + autofill_noplatform_success/ensemble/inconsistent_shape/inconsistent_shape/1 \ + autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim1_batch4/1 \ + autofill_noplatform_success/ensemble/inconsistent_shape/fp32_dim2_nobatch/1 \ + 
autofill_noplatform_success/ensemble/inconsistent_shape_2/inconsistent_shape_2/1 \ + autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim1_batch4/1 \ + autofill_noplatform_success/ensemble/inconsistent_shape_2/fp32_dim2_nobatch/1 \ + autofill_noplatform_success/ensemble/unmapped_output/unmapped_output/1 \ + autofill_noplatform_success/ensemble/unmapped_output/fp32_dim1_batch4_output3/1 ; do + mkdir -p $modelpath +done + +for modelpath in \ + autofill_noplatform/ensemble/invalid_decoupled_branching/repeat_int32/1 \ + autofill_noplatform/ensemble/invalid_decoupled_branching_2/repeat_int32/1; do + mkdir -p $modelpath + cp ./libtriton_repeat.so $modelpath/libtriton_repeat.so +done + +# Copy PyTorch models into the test model repositories. +for modelpath in \ + autofill_noplatform/pytorch/too_few_inputs/1 \ + autofill_noplatform/pytorch/too_few_outputs/1 \ + autofill_noplatform_success/pytorch/no_name_platform/1 \ + autofill_noplatform_success/pytorch/cpu_instance/1 ; do + mkdir -p $modelpath + cp /data/inferenceserver/${REPO_VERSION}/qa_model_repository/libtorch_float32_float32_float32/1/model.pt \ + $modelpath/. +done + +# Copy Python models into the test model repositories. +for modelpath in \ + autofill_noplatform/python/input_mismatch_datatype/1 \ + autofill_noplatform/python/input_mismatch_dims/1 \ + autofill_noplatform/python/output_mismatch_datatype/1 \ + autofill_noplatform/python/output_mismatch_dims/1 \ + autofill_noplatform_success/python/incomplete_output/1 \ + autofill_noplatform_success/python/unknown_input/1 \ + autofill_noplatform_success/python/unknown_output/1 \ + autofill_noplatform_success/python/empty_config/1 ; do + mkdir -p $modelpath + cp /opt/tritonserver/qa/python_models/auto_complete/model.py $modelpath/. +done +for modelpath in \ + autofill_noplatform/python/conflicting_max_batch_size \ + autofill_noplatform/python/input_missing_datatype \ + autofill_noplatform/python/input_missing_dims \ + autofill_noplatform/python/input_missing_name \ + autofill_noplatform/python/output_missing_datatype \ + autofill_noplatform/python/output_missing_dims \ + autofill_noplatform/python/output_missing_name \ + autofill_noplatform/python/no_return \ + autofill_noplatform/python/conflicting_scheduler_sequence \ + autofill_noplatform_success/python/dynamic_batching_no_op \ + autofill_noplatform_success/python/dynamic_batching \ + autofill_noplatform_success/python/incomplete_input \ + autofill_noplatform_success/python/model_transaction_policy \ + autofill_noplatform_success/python/model_transaction_policy_decoupled_false \ + autofill_noplatform_success/python/model_transaction_policy_no_op \ + autofill_noplatform_success/python/optional_input \ + autofill_noplatform/python/input_wrong_property \ + autofill_noplatform/python/model_transaction_policy_invalid_args \ + autofill_noplatform/python/model_transaction_policy_mismatch \ + autofill_noplatform/python/output_wrong_property ; do + mkdir -p $modelpath/1 + cp $modelpath/model.py $modelpath/1/. +done +for modelpath in \ + autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble \ + autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step \ + autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step ; do + mkdir -p $modelpath/1 + cp $modelpath/model.py $modelpath/1/. +done + +# Make version folders for custom test model repositories. 
+for modelpath in \ + autofill_noplatform/custom/no_delimiter/1 \ + autofill_noplatform/custom/unknown_backend.unknown/1 \ + autofill_noplatform_success/custom/empty_config.identity/1 \ + autofill_noplatform_success/custom/no_backend.identity/1 ; do + mkdir -p $modelpath +done + +# Make version folders as the instance group validation is deferred to +# the beginning of model creation +for modelpath in \ + noautofill_platform/invalid_cpu/1 \ + noautofill_platform/invalid_gpu/1 \ + noautofill_platform/negative_gpu/1 ; do + mkdir -p $modelpath +done + +# Copy other required models +mkdir -p special_cases/invalid_platform/1 +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/savedmodel_float32_float32_float32/1/model.savedmodel \ + special_cases/invalid_platform/1/ +# Note that graphdef models don't support auto-complete-config +# and that is why we are using graphdef model in this test case. +mkdir -p special_cases/noautofill_noconfig/1 +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_float32_float32_float32/1/model.graphdef \ + special_cases/noautofill_noconfig/1/ +# Create runtime escape scenario +mkdir -p special_cases/runtime_escape/1 special_cases/runtime_escape/dummy_runtime +touch special_cases/runtime_escape/dummy_runtime/libtriton_identity.so +# Setup invalid runtime model +mkdir -p special_cases/invalid_runtime/1 + +# Copy reshape model files into the test model repositories. +mkdir -p autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/1 +cp /data/inferenceserver/${REPO_VERSION}/qa_reshape_model_repository/graphdef_zero_2_float32/1/model.graphdef \ + autofill_noplatform_success/tensorflow_graphdef/reshape_config_provided/1 + +mkdir -p autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/1 +cp -r /data/inferenceserver/${REPO_VERSION}/qa_reshape_model_repository/savedmodel_zero_2_float32/1/model.savedmodel \ + autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/1 + +mkdir -p autofill_noplatform_success/tensorrt/reshape_config_provided/1 +cp /data/inferenceserver/${REPO_VERSION}/qa_reshape_model_repository/plan_zero_4_float32/1/model.plan \ + autofill_noplatform_success/tensorrt/reshape_config_provided/1 + +# Copy identity model into onnx test directories +mkdir -p autofill_noplatform_success/onnx/cpu_instance/1 +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/onnx_zero_1_float16/1/model.onnx \ + autofill_noplatform_success/onnx/cpu_instance/1 + +# Copy openvino models into test directories +for modelpath in \ + autofill_noplatform/openvino/bad_input_dims \ + autofill_noplatform/openvino/bad_output_dims \ + autofill_noplatform/openvino/too_few_inputs \ + autofill_noplatform/openvino/too_many_inputs \ + autofill_noplatform/openvino/unknown_input \ + autofill_noplatform/openvino/unknown_output \ + autofill_noplatform_success/openvino/empty_config \ + autofill_noplatform_success/openvino/no_config; do + cp -r /opt/tritonserver/qa/openvino_models/fixed_batch/1 $modelpath +done +cp -r /opt/tritonserver/qa/openvino_models/dynamic_batch/1 \ + autofill_noplatform_success/openvino/dynamic_batch +# Copy openvino model from qa_model_repository +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/openvino_int8_int8_int8/1 \ + autofill_noplatform_success/openvino/partial_config +cp /data/inferenceserver/${REPO_VERSION}/qa_model_repository/openvino_int8_int8_int8/output0_labels.txt \ + autofill_noplatform_success/openvino/partial_config + +rm -f $SERVER_LOG_BASE* 
$CLIENT_LOG +RET=0 + +# Run tests for logs which do not have a timestamp on them +for TARGET in `ls cli_messages`; do + case $TARGET in + "cli_override") + EXTRA_ARGS="--disable-auto-complete-config --strict-model-config=false" ;; + "cli_deprecation") + EXTRA_ARGS="--strict-model-config=true" ;; + *) + EXTRA_ARGS="" ;; + esac + + SERVER_ARGS="--model-repository=`pwd`/models $EXTRA_ARGS" + SERVER_LOG=$SERVER_LOG_BASE.cli_messages_${TARGET}.log + + rm -fr models && mkdir models + cp -r cli_messages/$TARGET models/. + + EXPECTEDS=models/$TARGET/expected* + + echo -e "Test on cli_messages/$TARGET" >> $CLIENT_LOG + + run_server + if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID + else + EXFOUND=0 + for EXPECTED in `ls $EXPECTEDS`; do + EX=`cat $EXPECTED` + echo "grepping for: $EX" + if grep "$EX" $SERVER_LOG; then + echo -e "Found \"$EX\"" >> $CLIENT_LOG + EXFOUND=1 + break + else + echo -e "Not found \"$EX\"" >> $CLIENT_LOG + fi + done + if [ "$EXFOUND" == "0" ]; then + echo -e "*** FAILED: cli_messages/$TARGET" >> $CLIENT_LOG + RET=1 + fi + fi +done + +# Run special test cases +for TARGET in `ls special_cases`; do + case $TARGET in + "invalid_platform") + EXTRA_ARGS="--disable-auto-complete-config" ;; + *) + EXTRA_ARGS="" ;; + esac + + SERVER_ARGS="--model-repository=`pwd`/models $EXTRA_ARGS" + SERVER_LOG=$SERVER_LOG_BASE.special_case_${TARGET}.log + + rm -fr models && mkdir models + cp -r special_cases/$TARGET models/. + + CONFIG=models/$TARGET/config.pbtxt + EXPECTEDS=models/$TARGET/expected* + + echo -e "Test on special_cases/$TARGET" >> $CLIENT_LOG + + # We expect all the tests to fail with one of the expected + # error messages + run_server + if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID + else + EXFOUND=0 + for EXPECTED in `ls $EXPECTEDS`; do + EX=`cat $EXPECTED` + if grep ^E[0-9][0-9][0-9][0-9].*"$EX" $SERVER_LOG; then + echo -e "Found \"$EX\"" >> $CLIENT_LOG + EXFOUND=1 + break + else + echo -e "Not found \"$EX\"" >> $CLIENT_LOG + fi + done + if [ "$EXFOUND" == "0" ]; then + echo -e "*** FAILED: special_cases/$TARGET" >> $CLIENT_LOG + RET=1 + fi + fi +done + +# Run noautofill unittest +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --log-verbose=1" +SERVER_LOG=$SERVER_LOG_BASE.special_case_noautofill_test.log + +rm -fr models && mkdir models +cp -r special_cases/noautofill_noconfig models/. + +echo -e "Test on special_cases/noautofill_test" >> $CLIENT_LOG + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python noautofill_test.py >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Python NoAutoFill Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +for TRIAL in $TRIALS; do + # Run all tests that require no autofill but that add the platform to + # the model config before running the test + for TARGET in `ls noautofill_platform`; do + SERVER_ARGS="--model-repository=`pwd`/models --strict-model-config=true" + SERVER_LOG=$SERVER_LOG_BASE.noautofill_platform_${TRIAL}_${TARGET}.log + + rm -fr models && mkdir models + cp -r noautofill_platform/$TARGET models/. 
+ + CONFIG=models/$TARGET/config.pbtxt + EXPECTEDS=models/$TARGET/expected* + + # If there is a config.pbtxt change/add platform to it + if [ -f $CONFIG ]; then + sed -i '/platform:/d' $CONFIG + echo "platform: \"$TRIAL\"" >> $CONFIG + cat $CONFIG + fi + + echo -e "Test platform $TRIAL on noautofill_platform/$TARGET" >> $CLIENT_LOG + + # We expect all the tests to fail with one of the expected + # error messages + run_server + if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID + else + EXFOUND=0 + for EXPECTED in `ls $EXPECTEDS`; do + EX=`cat $EXPECTED` + if grep ^E[0-9][0-9][0-9][0-9].*"$EX" $SERVER_LOG; then + echo -e "Found \"$EX\"" >> $CLIENT_LOG + EXFOUND=1 + break + else + echo -e "Not found \"$EX\"" >> $CLIENT_LOG + fi + done + + if [ "$EXFOUND" == "0" ]; then + echo -e "*** FAILED: platform $TRIAL noautofill_platform/$TARGET" >> $CLIENT_LOG + RET=1 + fi + fi + done +done + +for TRIAL in $TRIALS; do + # Run all tests that require no autofill but that add the platform to + # the model config before running the test + for TARGET in `ls noautofill_platform`; do + SERVER_ARGS="--model-repository=`pwd`/models --disable-auto-complete-config" + SERVER_LOG=$SERVER_LOG_BASE.noautofill_platform_disableflag_${TRIAL}_${TARGET}.log + + rm -fr models && mkdir models + cp -r noautofill_platform/$TARGET models/. + + CONFIG=models/$TARGET/config.pbtxt + EXPECTEDS=models/$TARGET/expected* + + # If there is a config.pbtxt change/add platform to it + if [ -f $CONFIG ]; then + sed -i '/platform:/d' $CONFIG + echo "platform: \"$TRIAL\"" >> $CONFIG + cat $CONFIG + fi + + echo -e "Test platform $TRIAL on noautofill_platform/$TARGET with disable-auto-complete-config flag" >> $CLIENT_LOG + + # We expect all the tests to fail with one of the expected + # error messages + run_server + if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID + else + EXFOUND=0 + for EXPECTED in `ls $EXPECTEDS`; do + EX=`cat $EXPECTED` + if grep ^E[0-9][0-9][0-9][0-9].*"$EX" $SERVER_LOG; then + echo -e "Found \"$EX\"" >> $CLIENT_LOG + EXFOUND=1 + break + else + echo -e "Not found \"$EX\"" >> $CLIENT_LOG + fi + done + + if [ "$EXFOUND" == "0" ]; then + echo -e "*** FAILED: platform $TRIAL noautofill_platform/$TARGET with disable-auto-complete-config flag" >> $CLIENT_LOG + RET=1 + fi + fi + done +done + +# Run all autofill tests that don't add a platform to the model config +# before running the test +for TARGET_DIR in `ls -d autofill_noplatform/*/*`; do + TARGET_DIR_DOT=`echo $TARGET_DIR | tr / .` + TARGET=`basename ${TARGET_DIR}` + + SERVER_ARGS="--model-repository=`pwd`/models --strict-model-config=false" + SERVER_LOG=$SERVER_LOG_BASE.${TARGET_DIR_DOT}.log + + # If there is a config.pbtxt at the top-level of the test then + # assume that the directory is a single model. Otherwise assume + # that the directory is an entire model repository. + rm -fr models && mkdir models + if [ -f ${TARGET_DIR}/config.pbtxt ]; then + cp -r ${TARGET_DIR} models/. + EXPECTEDS=models/$TARGET/expected* + else + cp -r ${TARGET_DIR}/* models/. 
+ EXPECTEDS=models/expected* + fi + + echo -e "Test ${TARGET_DIR}" >> $CLIENT_LOG + + # We expect all the tests to fail with one of the expected + # error messages + run_server + if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID + else + EXFOUND=0 + for EXPECTED in `ls $EXPECTEDS`; do + EX=`cat $EXPECTED` + if grep ^E[0-9][0-9][0-9][0-9].*"$EX" $SERVER_LOG; then + echo -e "Found \"$EX\"" >> $CLIENT_LOG + EXFOUND=1 + break + else + echo -e "Not found \"$EX\"" >> $CLIENT_LOG + fi + done + + if [ "$EXFOUND" == "0" ]; then + echo -e "*** FAILED: ${TARGET_DIR}" >> $CLIENT_LOG + RET=1 + fi + fi +done + +# Run all autofill tests that are expected to be successful. These +# tests don't add a platform to the model config before running +for TARGET_DIR in `ls -d autofill_noplatform_success/*/*`; do + TARGET_DIR_DOT=`echo $TARGET_DIR | tr / .` + TARGET=`basename ${TARGET_DIR}` + + SERVER_ARGS="--model-repository=`pwd`/models --strict-model-config=false" + SERVER_LOG=$SERVER_LOG_BASE.${TARGET_DIR_DOT}.log + + # If there is a config.pbtxt at the top-level of the test then + # assume that the directory is a single model. Otherwise assume + # that the directory is an entire model repository. + rm -fr models && mkdir models + if [ -f ${TARGET_DIR}/config.pbtxt ] || [ "$TARGET" = "no_config" ] \ + || [ "$TARGET" = "no_config_variable" ] || [ "$TARGET" = "no_config_shape_tensor" ] \ + || [ "$TARGET" = "no_config_non_linear_format_io" ] ; then + cp -r ${TARGET_DIR} models/. + else + cp -r ${TARGET_DIR}/* models/. + fi + + echo -e "Test $TARGET_DIR" >> $CLIENT_LOG + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "*** FAILED: unable to start $SERVER" >> $CLIENT_LOG + RET=1 + else + set +e + python ./compare_status.py --expected_dir models/$TARGET --model $TARGET >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "*** FAILED: unexpected model config" >> $CLIENT_LOG + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + fi +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" + cat $CLIENT_LOG +fi + +exit $RET diff --git a/qa/L0_model_namespacing/python_addsub/__init__.py b/qa/L0_model_namespacing/python_addsub/__init__.py new file mode 100755 index 0000000000..a664eafef0 --- /dev/null +++ b/qa/L0_model_namespacing/python_addsub/__init__.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + # Use auto complete feature to ship config.pbtxt along with the Python + # model definition + @staticmethod + def auto_complete_config(auto_complete_model_config): + # Only use packaged config if config is not explicitly provided + config = auto_complete_model_config.as_dict() + if (len(config["input"]) != 0) or (len(config["output"]) != 0): + return auto_complete_model_config + + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + return auto_complete_model_config + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + responses.append(pb_utils.InferenceResponse(self.addsub(in_0, in_1))) + return responses + + def addsub(self, in_0, in_1): + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) + return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_model_namespacing/python_subadd/__init__.py b/qa/L0_model_namespacing/python_subadd/__init__.py new file mode 100755 index 0000000000..bd3ddefe9e --- /dev/null +++ b/qa/L0_model_namespacing/python_subadd/__init__.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + # Use auto complete feature to ship config.pbtxt along with the Python + # model definition + @staticmethod + def auto_complete_config(auto_complete_model_config): + # Only use packaged config if config is not explicitly provided + config = auto_complete_model_config.as_dict() + if (len(config["input"]) != 0) or (len(config["output"]) != 0): + return auto_complete_model_config + + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + return auto_complete_model_config + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + responses.append(pb_utils.InferenceResponse(self.subadd(in_0, in_1))) + return responses + + def subadd(self, in_0, in_1): + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + 
in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() - in_1.as_numpy(), + in_0.as_numpy() + in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) + return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_model_namespacing/test.py b/qa/L0_model_namespacing/test.py new file mode 100755 index 0000000000..f45300d4fd --- /dev/null +++ b/qa/L0_model_namespacing/test.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
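[Editor's note] The two Python backend models above, python_addsub and python_subadd, are identical except that they swap which output carries the sum and which carries the difference. As a quick reference, the following standalone sketch (plain numpy, hypothetical variable names, no Triton dependencies) illustrates the tensor contract that the namespacing tests below rely on:

import numpy as np

# 16-element INT32 tensors, matching the dims/data_type the models auto-complete.
input0 = np.arange(16, dtype=np.int32)
input1 = np.arange(16, dtype=np.int32)

# python_addsub: OUTPUT0 = INPUT0 + INPUT1, OUTPUT1 = INPUT0 - INPUT1
addsub_out0, addsub_out1 = input0 + input1, input0 - input1

# python_subadd swaps the outputs: OUTPUT0 = INPUT0 - INPUT1, OUTPUT1 = INPUT0 + INPUT1
subadd_out0, subadd_out1 = input0 - input1, input0 + input1

assert np.array_equal(addsub_out0, subadd_out1)
assert np.array_equal(addsub_out1, subadd_out0)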
+
+import os
+import sys
+
+sys.path.append(os.path.join(os.environ["TRITON_QA_ROOT_DIR"], "common"))
+
+import shutil
+import time
+import unittest
+
+import numpy as np
+import test_util as tu
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException
+
+#
+# Test utilities
+#
+
+
+# Checker to perform inference on a given model, expecting the model to have
+# [INPUT0, INPUT1] and produce [OUTPUT0, OUTPUT1] where:
+# OUTPUT0 = INPUT0 + INPUT1
+# OUTPUT1 = INPUT0 - INPUT1
+class AddSubChecker:
+    # Optional 'checker_client' may be provided to use a different
+    # Triton client library; currently it must be either the Triton HTTP client
+    # library or the Triton GRPC client library
+    def __init__(self, checker_client=None):
+        # client library selection
+        if checker_client is None:
+            import tritonclient.http as checker_client
+        if "http" in checker_client.__name__:
+            self.client_ = checker_client.InferenceServerClient("localhost:8000")
+        else:
+            self.client_ = checker_client.InferenceServerClient("localhost:8001")
+
+        # Create infer input tensors
+        self.inputs_ = []
+        self.inputs_.append(checker_client.InferInput("INPUT0", [16], "INT32"))
+        self.inputs_.append(checker_client.InferInput("INPUT1", [16], "INT32"))
+
+        # Initialize the data and expected output
+        input_data = np.arange(start=0, stop=16, dtype=np.int32)
+        self.inputs_[0].set_data_from_numpy(input_data)
+        self.inputs_[1].set_data_from_numpy(input_data)
+        self.expected_outputs_ = {
+            "add": (input_data + input_data),
+            "sub": (input_data - input_data),
+        }
+
+    def infer(self, model):
+        res = self.client_.infer(model, self.inputs_)
+        np.testing.assert_allclose(
+            res.as_numpy("OUTPUT0"), self.expected_outputs_["add"]
+        )
+        np.testing.assert_allclose(
+            res.as_numpy("OUTPUT1"), self.expected_outputs_["sub"]
+        )
+
+
+# Checker to perform inference on a given model, expecting the model to have
+# [INPUT0, INPUT1] and produce [OUTPUT0, OUTPUT1] where:
+# OUTPUT0 = INPUT0 - INPUT1
+# OUTPUT1 = INPUT0 + INPUT1
+class SubAddChecker(AddSubChecker):
+    def infer(self, model):
+        res = self.client_.infer(model, self.inputs_)
+        np.testing.assert_allclose(
+            res.as_numpy("OUTPUT0"), self.expected_outputs_["sub"]
+        )
+        np.testing.assert_allclose(
+            res.as_numpy("OUTPUT1"), self.expected_outputs_["add"]
+        )
+
+
+#
+# Test suites and cases
+#
+
+
+class ModelNamespacePoll(tu.TestResultCollector):
+    def setUp(self):
+        self.addsub_ = AddSubChecker()
+        self.subadd_ = SubAddChecker()
+        # For other server interaction
+        self.client_ = httpclient.InferenceServerClient("localhost:8000")
+
+    def check_health(self, expect_live=True, expect_ready=True):
+        self.assertEqual(self.client_.is_server_live(), expect_live)
+        self.assertEqual(self.client_.is_server_ready(), expect_ready)
+
+    def test_no_duplication(self):
+        # Enable model namespacing on repositories that are already valid without
+        # enabling model namespacing.
+        # All models should be visible and can be inferred individually
+        self.check_health()
+
+        # infer check
+        for model in ["simple_addsub", "composing_addsub"]:
+            self.addsub_.infer(model)
+        for model in ["simple_subadd", "composing_subadd"]:
+            self.subadd_.infer(model)
+
+    def test_duplication(self):
+        # Enable model namespacing on repositories where each repo has one
+        # ensemble that requires a composing model ('composing_model') which
+        # exists in both repos.
+        # Expect all models to be visible; the ensemble will pick up the correct
+        # model even though the composing model can't be inferred individually.
+        self.check_health()
+
+        # infer check
+        for model in [
+            "simple_addsub",
+        ]:
+            self.addsub_.infer(model)
+        for model in [
+            "simple_subadd",
+        ]:
+            self.subadd_.infer(model)
+
+        # error check
+        try:
+            self.addsub_.infer("composing_model")
+            self.assertTrue(False, "expected error for inferring ambiguous named model")
+        except InferenceServerException as ex:
+            self.assertIn("ambiguity", ex.message())
+
+    def test_ensemble_duplication(self):
+        # Enable model namespacing on repositories where each repo has one
+        # ensemble with the same name. Expect the ensemble to pick up the correct
+        # model.
+        # Expect all models to be visible; the ensemble will pick up the correct
+        # model even though the ensemble itself can't be inferred without providing
+        # a namespace.
+        self.check_health()
+
+        # infer
+        for model in [
+            "composing_addsub",
+        ]:
+            self.addsub_.infer(model)
+        for model in [
+            "composing_subadd",
+        ]:
+            self.subadd_.infer(model)
+
+        # error check
+        try:
+            self.addsub_.infer("simple_ensemble")
+            self.assertTrue(False, "expected error for inferring ambiguous named model")
+        except InferenceServerException as ex:
+            self.assertIn("ambiguity", ex.message())
+
+    def test_dynamic_resolution(self):
+        # Same model setup as 'test_duplication'; this test removes / adds back one
+        # of the composing models at runtime and expects the ensembles to be properly
+        # linked to an existing composing model at each step.
+        # 1. Remove 'composing_model' in addsub_repo, expect both ensembles to use
+        # 'composing_model' in subadd_repo and act as subadd
+        # 2. Add back 'composing_model' in addsub_repo, expect the ensembles to behave the
+        # same as before the removal.
+        self.assertTrue("NAMESPACE_TESTING_DIRCTORY" in os.environ)
+        td = os.environ["NAMESPACE_TESTING_DIRCTORY"]
+        composing_before_path = os.path.join(td, "addsub_repo", "composing_model")
+        composing_after_path = os.path.join(td, "composing_model")
+
+        self.check_health()
+        # step 1.
+        shutil.move(composing_before_path, composing_after_path)
+        time.sleep(5)
+
+        # infer
+        for model in ["simple_subadd", "simple_addsub", "composing_model"]:
+            self.subadd_.infer(model)
+
+        # step 2.
+        shutil.move(composing_after_path, composing_before_path)
+        time.sleep(5)
+
+        # infer
+        for model in [
+            "simple_addsub",
+        ]:
+            self.addsub_.infer(model)
+        for model in [
+            "simple_subadd",
+        ]:
+            self.subadd_.infer(model)
+
+        # error check
+        try:
+            self.addsub_.infer("composing_model")
+            self.assertTrue(False, "expected error for inferring ambiguous named model")
+        except InferenceServerException as ex:
+            self.assertIn("ambiguity", ex.message())
+
+
+class ModelNamespaceExplicit(tu.TestResultCollector):
+    def setUp(self):
+        self.addsub_ = AddSubChecker()
+        self.subadd_ = SubAddChecker()
+        # For other server interaction
+        self.client_ = httpclient.InferenceServerClient("localhost:8000")
+
+    def check_health(self, expect_live=True, expect_ready=True):
+        self.assertEqual(self.client_.is_server_live(), expect_live)
+        self.assertEqual(self.client_.is_server_ready(), expect_ready)
+
+    def test_no_duplication(self):
+        # Enable model namespacing on repositories that are already valid without
+        # enabling model namespacing.
+        # All models should be visible and can be inferred individually
+        self.check_health()
+        # load ensembles, which cascades to load the composing models
+        for model in ["simple_addsub", "simple_subadd"]:
+            self.client_.load_model(model)
+
+        # infer
+        for model in ["simple_addsub", "composing_addsub"]:
+            self.addsub_.infer(model)
+        for model in ["simple_subadd", "composing_subadd"]:
+            self.subadd_.infer(model)
+
+    def test_duplication(self):
+        # Enable model namespacing on repositories where each repo has one
+        # ensemble that requires a composing model ('composing_model') which
+        # exists in both repos.
+        # Expect all models to be visible; the ensemble will pick up the correct
+        # model even though the composing model can't be inferred individually.
+        self.check_health()
+        # load ensembles, which cascades to load the composing models
+        for model in ["simple_addsub", "simple_subadd"]:
+            self.client_.load_model(model)
+
+        # infer
+        for model in [
+            "simple_addsub",
+        ]:
+            self.addsub_.infer(model)
+        for model in [
+            "simple_subadd",
+        ]:
+            self.subadd_.infer(model)
+
+        # error check
+        try:
+            self.addsub_.infer("composing_model")
+            self.assertTrue(False, "expected error for inferring ambiguous named model")
+        except InferenceServerException as ex:
+            self.assertIn("ambiguity", ex.message())
+
+    def test_ensemble_duplication(self):
+        # Enable model namespacing on repositories where each repo has one
+        # ensemble with the same name. Expect the ensemble to pick up the correct
+        # model.
+        # Expect all models to be visible; the ensemble will pick up the correct
+        # model even though the ensemble itself can't be inferred without providing
+        # a namespace.
+        self.check_health()
+        # load ensembles, which cascades to load the composing models
+        for model in ["simple_ensemble"]:
+            self.client_.load_model(model)
+
+        # infer
+        for model in [
+            "composing_addsub",
+        ]:
+            self.addsub_.infer(model)
+        for model in [
+            "composing_subadd",
+        ]:
+            self.subadd_.infer(model)
+
+        # error check
+        try:
+            self.addsub_.infer("simple_ensemble")
+            self.assertTrue(False, "expected error for inferring ambiguous named model")
+        except InferenceServerException as ex:
+            self.assertIn("ambiguity", ex.message())
+
+    def test_dynamic_resolution(self):
+        # Same model setup as 'test_duplication'; this test removes / adds back one
+        # of the composing models at runtime and expects the ensembles to be properly
+        # linked to an existing composing model at each step.
+        # 1. Remove 'composing_model' in addsub_repo, expect both ensembles to use
+        # 'composing_model' in subadd_repo and act as subadd.
+        # 2. Add back 'composing_model' in addsub_repo, expect the ensembles to behave the
+        # same as before the removal.
+        self.assertTrue("NAMESPACE_TESTING_DIRCTORY" in os.environ)
+        td = os.environ["NAMESPACE_TESTING_DIRCTORY"]
+        composing_before_path = os.path.join(td, "addsub_repo", "composing_model")
+        composing_after_path = os.path.join(td, "composing_model")
+
+        self.check_health()
+        # step 1.
+        shutil.move(composing_before_path, composing_after_path)
+        # load ensembles, which cascades to load the composing models
+        for model in ["simple_addsub", "simple_subadd"]:
+            self.client_.load_model(model)
+
+        # infer
+        for model in ["simple_subadd", "simple_addsub", "composing_model"]:
+            self.subadd_.infer(model)
+
+        # step 2.
+        shutil.move(composing_after_path, composing_before_path)
+        # Explicitly load one of the ensembles; this should still trigger a
+        # cascading (re-)load
+        for model in [
+            "simple_addsub",
+        ]:
+            self.client_.load_model(model)
+
+        # infer
+        for model in [
+            "simple_addsub",
+        ]:
+            self.addsub_.infer(model)
+        for model in [
+            "simple_subadd",
+        ]:
+            self.subadd_.infer(model)
+
+        # error check
+        try:
+            self.addsub_.infer("composing_model")
+            self.assertTrue(False, "expected error for inferring ambiguous named model")
+        except InferenceServerException as ex:
+            self.assertIn("ambiguity", ex.message())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_model_namespacing/test.sh b/qa/L0_model_namespacing/test.sh
new file mode 100755
index 0000000000..414bd3dde9
--- /dev/null
+++ b/qa/L0_model_namespacing/test.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
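[Editor's note] test.py above verifies the ambiguous-name behavior by calling infer() inside a try/except and asserting that the raised InferenceServerException message contains "ambiguity". Below is a minimal sketch of the same check written with unittest's assertRaises context manager; the client setup mirrors the AddSubChecker in test.py, and the server address and model name are assumptions carried over from that test setup, not anything new:

import unittest

import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException


class AmbiguityCheckSketch(unittest.TestCase):
    def test_ambiguous_composing_model(self):
        client = httpclient.InferenceServerClient("localhost:8000")
        input_data = np.arange(16, dtype=np.int32)
        inputs = [
            httpclient.InferInput("INPUT0", [16], "INT32"),
            httpclient.InferInput("INPUT1", [16], "INT32"),
        ]
        for infer_input in inputs:
            infer_input.set_data_from_numpy(input_data)
        # Inferring the duplicated composing model without a namespace is
        # expected to fail, as exercised by test_duplication above.
        with self.assertRaises(InferenceServerException) as ctx:
            client.infer("composing_model", inputs)
        self.assertIn("ambiguity", ctx.exception.message())


if __name__ == "__main__":
    unittest.main()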
+ +TRITON_QA_ROOT_DIR=${TRITON_QA_ROOT_DIR:="/opt/tritonserver/qa"} +source $TRITON_QA_ROOT_DIR/common/util.sh + +RET=0 + +TEST_PY=./test.py +# tests are run individually +EXPECTED_NUM_TESTS="1" +TEST_RESULT_FILE='test_results.txt' + + +export CUDA_VISIBLE_DEVICES=0 +export TRITON_QA_ROOT_DIR=$TRITON_QA_ROOT_DIR +export TRITON_QA_PYTHON_MODEL_DIR=$TRITON_QA_ROOT_DIR/L0_model_namespacing + +rm -fr *.log + +REPO_ARGS="--model-namespacing=true --model-repository=`pwd`/test_dir/addsub_repo --model-repository=`pwd`/test_dir/subadd_repo" +POLL_ARGS="--model-control-mode=POLL --repository-poll-secs=2" +EXPLICIT_ARGS="--model-control-mode=EXPLICIT" + +SERVER=/opt/tritonserver/bin/tritonserver + +# List all tests as each test will use different repo configuration +TEST_LIST=${TEST_LIST:="test_duplication \ + test_dynamic_resolution \ + test_ensemble_duplication \ + test_no_duplication"} + +# Helper to make sure all ensemble have version directory +CURR_DIR=`pwd` +for test_name in $TEST_LIST; do + for model_dir in $CURR_DIR/$test_name/*/*; do + mkdir -p $model_dir/1 + done +done + +# Set this variable to avoid generation of '__pycache__' in the model directory, +# which will cause unintended model reload in POLLING model as Triton sees +# changes in the model directory +export PYTHONDONTWRITEBYTECODE=1 + +# Polling +for test_name in $TEST_LIST; do + TEST_SUITE="ModelNamespacePoll" + TEST_LOG="`pwd`/test.$TEST_SUITE.$test_name.log" + SERVER_LOG="./server.$TEST_SUITE.$test_name.log" + + rm -fr `pwd`/test_dir + cp -r `pwd`/$test_name `pwd`/test_dir + SERVER_ARGS="$REPO_ARGS $POLL_ARGS" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + # Pass in the test directory as the test may modify the structure + NAMESPACE_TESTING_DIRCTORY=`pwd`/test_dir python $TEST_PY $TEST_SUITE.$test_name >>$TEST_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + cat $TEST_LOG + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Explicit +for test_name in $TEST_LIST; do + TEST_SUITE="ModelNamespaceExplicit" + TEST_LOG="`pwd`/test.$TEST_SUITE.$test_name.log" + SERVER_LOG="./server.$TEST_SUITE.$test_name.log" + + rm -fr `pwd`/test_dir + cp -r `pwd`/$test_name `pwd`/test_dir + SERVER_ARGS="$REPO_ARGS $EXPLICIT_ARGS" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + # Pass in the test directory as the test may modify the structure + NAMESPACE_TESTING_DIRCTORY=`pwd`/test_dir python $TEST_PY $TEST_SUITE.$test_name >>$TEST_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + cat $TEST_LOG + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py new file mode 100644 index 0000000000..13a611e7a3 --- /dev/null +++ b/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_addsub import * diff --git a/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt new file mode 100644 index 0000000000..245e256976 --- /dev/null +++ b/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
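[Editor's note] For each (suite, test) combination, test.sh above copies the per-test repositories into a scratch test_dir, starts tritonserver against both repositories, and then invokes one unittest member as "python test.py <Suite>.<test_name>". The hypothetical Python driver below sketches that per-iteration setup for a single combination; it deliberately omits the run_server step and assumes the server is started separately, and the environment variable names (including the NAMESPACE_TESTING_DIRCTORY spelling) follow test.sh and test.py:

import os
import shutil
import subprocess
import sys

# One (suite, test) combination; test.sh loops over all of them.
suite, test_name = "ModelNamespacePoll", "test_no_duplication"
scratch = os.path.join(os.getcwd(), "test_dir")

# Fresh scratch copy of the per-test repositories, as test.sh does with cp -r.
shutil.rmtree(scratch, ignore_errors=True)
shutil.copytree(test_name, scratch)

env = dict(
    os.environ,
    NAMESPACE_TESTING_DIRCTORY=scratch,
    PYTHONDONTWRITEBYTECODE="1",  # avoid __pycache__ churn that polling mode would pick up
)
# Assumes tritonserver is already running with --model-namespacing=true and the
# addsub_repo/subadd_repo repositories under the scratch directory.
completed = subprocess.run([sys.executable, "test.py", f"{suite}.{test_name}"], env=env)
print("unittest exit code:", completed.returncode)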
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_model" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py new file mode 100644 index 0000000000..664c20b58f --- /dev/null +++ b/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_subadd import * diff --git a/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt new file mode 100644 index 0000000000..85d8ec0051 --- /dev/null +++ b/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt @@ -0,0 +1,88 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_model" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py new file mode 100644 index 0000000000..13a611e7a3 --- /dev/null +++ b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_addsub import * diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt new file mode 100644 index 0000000000..245e256976 --- /dev/null +++ b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_model" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py new file mode 100644 index 0000000000..664c20b58f --- /dev/null +++ b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_subadd import * diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt new file mode 100644 index 0000000000..245e256976 --- /dev/null +++ b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_model" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py new file mode 100644 index 0000000000..13a611e7a3 --- /dev/null +++ b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_addsub import * diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt new file mode 100644 index 0000000000..2a9f0003a3 --- /dev/null +++ b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_addsub" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py new file mode 100644 index 0000000000..664c20b58f --- /dev/null +++ b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_subadd import * diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt new file mode 100644 index 0000000000..0ee1015f25 --- /dev/null +++ b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_subadd" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py new file mode 100644 index 0000000000..13a611e7a3 --- /dev/null +++ b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_addsub import * diff --git a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt new file mode 100644 index 0000000000..2a9f0003a3 --- /dev/null +++ b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_addsub" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py new file mode 100644 index 0000000000..664c20b58f --- /dev/null +++ b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py @@ -0,0 +1,6 @@ +import os +import sys + +# load pre-defined QA model +sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) +from python_subadd import * diff --git a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt new file mode 100644 index 0000000000..0ee1015f25 --- /dev/null +++ b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
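[Editor's note] The ensemble config.pbtxt files used throughout these test directories are deliberately near-identical: a non-batching ensemble with two 16-element INT32 inputs and two INT32 outputs, whose single scheduling step passes INPUT0/INPUT1 straight through to one composing model and maps that model's OUTPUT0/OUTPUT1 back out. Only the composing model name differs between files. The sketch below is a hypothetical generator (not part of this PR) that prints the shared pattern for each composing model name seen above:

# Emit the shared ensemble config pattern for each composing model name.
ENSEMBLE_TEMPLATE = """\
platform: "ensemble"
max_batch_size: 0
version_policy: { all { }}
input [
  { name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] },
  { name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] }
]
output [
  { name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] },
  { name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] }
]
ensemble_scheduling {
  step [
    {
      model_name: "%(composing)s"
      model_version: -1
      input_map { key: "INPUT0" value: "INPUT0" }
      input_map { key: "INPUT1" value: "INPUT1" }
      output_map { key: "OUTPUT0" value: "OUTPUT0" }
      output_map { key: "OUTPUT1" value: "OUTPUT1" }
    }
  ]
}
"""

for composing in ("composing_model", "composing_addsub", "composing_subadd"):
    print(ENSEMBLE_TEMPLATE % {"composing": composing})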
+ +platform: "ensemble" +max_batch_size: 0 +version_policy: { all { }} + + + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "composing_subadd" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_model_queue/ensemble_zero_1_float32/config.pbtxt b/qa/L0_model_queue/ensemble_zero_1_float32/config.pbtxt new file mode 100644 index 0000000000..8cf3d53e79 --- /dev/null +++ b/qa/L0_model_queue/ensemble_zero_1_float32/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble_zero_1_float32" +platform: "ensemble" +max_batch_size: 32 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "custom_zero_1_float32" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + } + ] +} \ No newline at end of file diff --git a/qa/L0_model_queue/model_queue_test.py b/qa/L0_model_queue/model_queue_test.py new file mode 100755 index 0000000000..025d126417 --- /dev/null +++ b/qa/L0_model_queue/model_queue_test.py @@ -0,0 +1,677 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import re +import threading +import time +import unittest +from builtins import range +from ctypes import * + +import infer_util as iu +import numpy as np +import requests +import test_util as tu +from tritonclientutils import InferenceServerException + +_max_queue_delay_ms = 10000 + +_deferred_exceptions_lock = threading.Lock() +_deferred_exceptions = [] + + +class ModelQueueTest(tu.TestResultCollector): + def setUp(self): + self.trials_ = [] + for base in ["custom", "ensemble"]: + for is_http_trial in [True, False]: + self.trials_.append({"base": base, "is_http_trial": is_http_trial}) + global _deferred_exceptions + _deferred_exceptions = [] + + def add_deferred_exception(self, ex): + global _deferred_exceptions + with _deferred_exceptions_lock: + _deferred_exceptions.append(ex) + + def check_deferred_exception(self): + # Just raise one of the exceptions... 
+ with _deferred_exceptions_lock: + if len(_deferred_exceptions) > 0: + first_exception = _deferred_exceptions[0] + _deferred_exceptions.pop(0) + raise first_exception + + def _get_metrics(self): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def _metrics_before_test(self, model, reason): + pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)' + metrics = self._get_metrics() + match = re.search(pattern, metrics) + if match: + return int(match.group(1)) + else: + raise Exception(f"Failure metrics for model='{model}' not found") + + def _assert_metrics( + self, model_name, reason, expected_count_increase, initial_count + ): + metrics = self._get_metrics() + # Add initial count + expected count for the the test + expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}' + self.assertIn(expected_metric, metrics) + + def check_response( + self, + bs, + dtype, + shapes, + priority, + timeout_us, + thresholds, + base="custom", + is_http_trial=True, + ): + full_shapes = [ + [ + bs, + ] + + shape + for shape in shapes + ] + try: + start_ms = int(round(time.time() * 1000)) + iu.infer_zero( + self, + base, + bs, + dtype, + full_shapes, + full_shapes, + model_version=1, + use_http_json_tensors=False, + use_http=is_http_trial, + use_grpc=(not is_http_trial), + use_streaming=False, + priority=priority, + timeout_us=timeout_us, + ) + + end_ms = int(round(time.time() * 1000)) + + lt_ms = thresholds[0] + gt_ms = thresholds[1] + if lt_ms is not None: + self.assertTrue( + (end_ms - start_ms) < lt_ms, + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + if gt_ms is not None: + self.assertTrue( + (end_ms - start_ms) > gt_ms, + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + except Exception as ex: + self.add_deferred_exception(ex) + + def test_max_queue_size(self): + # Send a request with a static batch size == preferred size to trigger + # model execution. Then sends 10 requests to overload the model queue, + # expecting 2 of the requests are returned with error code immediately. 
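For reference, the failure counters that _metrics_before_test() and _assert_metrics() above parse come from the Prometheus text endpoint at http://localhost:8002/metrics. The sketch below shows the shape of that text and how the same regular expression extracts the counter; the sample payload and the counter values are illustrative only and are not taken from a real server run.

import re

# Illustrative Prometheus-format payload (the model/reason names appear in
# these tests; the counter values here are made up).
SAMPLE_METRICS = (
    'nv_inference_request_failure{model="custom_zero_1_float32",'
    'reason="REJECTED",version="1"} 2\n'
    'nv_inference_request_failure{model="ensemble_zero_1_float32",'
    'reason="OTHER",version="1"} 4\n'
)


def failure_count(metrics_text, model, reason):
    # Same pattern shape as _metrics_before_test(): literal braces are escaped
    # and the trailing integer counter is captured.
    pattern = (
        rf'nv_inference_request_failure\{{model="{model}",'
        rf'reason="{reason}",version="1"\}} (\d+)'
    )
    match = re.search(pattern, metrics_text)
    return int(match.group(1)) if match else None


print(failure_count(SAMPLE_METRICS, "custom_zero_1_float32", "REJECTED"))  # -> 2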
+ dtype = np.float32 + shapes = ([16],) + + for trial in self.trials_: + preceding_thread = threading.Thread( + target=self.check_response, + args=(8, dtype, shapes, 0, 0, (5999, 1000)), + ) + threads = [] + for i in range(10): + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) + preceding_thread.start() + time.sleep(0.5) + for t in threads: + t.start() + + preceding_thread.join() + for t in threads: + t.join() + + # Expect exactly two exception with exceeding max queue size error + expected_exceeded_count = 2 + exceeded_count = 0 + for i in range(expected_exceeded_count): + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue( + "Exceeds maximum queue size" in ex.message(), + 'Expected error message "Exceeds maximum queue size", got: {}'.format( + ex + ), + ) + exceeded_count = exceeded_count + 1 + self.assertEqual( + exceeded_count, + expected_exceeded_count, + "expected {} requests to fail with exceeded max queue size error, got {}".format( + expected_exceeded_count, exceeded_count + ), + ) + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_policy_delay(self): + # Send requests with batch sizes 1, 1, 3 where the second and third + # requests are sent after 'default_timeout_microseconds'. + # Expect the first request is timed-out and delayed, which makes the + # second and third request be batched together and executed. While the + # first request must wait for 'max_queue_delay_microseconds' until it + # can be executed. + dtype = np.float32 + shapes = ([16],) + for trial in self.trials_: + try: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (15000, 10000)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) + threads[0].start() + time.sleep(0.2) + threads[1].start() + threads[2].start() + + for t in threads: + t.join() + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_policy_reject(self): + # Send requests with batch sizes 1, 1, 3 where the second and third + # requests are sent after 'default_timeout_microseconds'. + # Expect the first request is timed-out and rejected, which makes the + # second and third request be batched together and executed. 
+ initial_metrics_value_ensemble = self._metrics_before_test( + "ensemble_zero_1_float32", "OTHER" + ) + initial_metrics_value_custom = self._metrics_before_test( + "custom_zero_1_float32", "REJECTED" + ) + dtype = np.float32 + shapes = ([16],) + for trial in self.trials_: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) + threads[0].start() + time.sleep(0.2) + threads[1].start() + threads[2].start() + + for t in threads: + t.join() + + # Expect only one error for rejection + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue( + "Request timeout expired" in ex.message(), + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + expected_count_increase = 4 + # NOTE: Ensemble failure metrics will reflect the failure counts + # of their composing models as well as the parent model, but currently do not capture the same granularity + # for the "reason" label and will default to the "OTHER" reason. + self._assert_metrics( + "ensemble_zero_1_float32", + "OTHER", + expected_count_increase, + initial_metrics_value_ensemble, + ) + expected_count_increase = 4 + self._assert_metrics( + "custom_zero_1_float32", + "REJECTED", + expected_count_increase, + initial_metrics_value_custom, + ) + + def test_timeout_override(self): + # Send requests with batch sizes 1, 1, 3 where the first request + # overrides the timeout to be less than 'default_timeout_microseconds', + # and the second and third requests are sent after the overridden + # timeout. Expect the first request is timed-out and rejected before + # 'default_timeout_microseconds', which makes the second and third + # request be batched together and executed earlier than + # 'default_timeout_microseconds'. + + dtype = np.float32 + shapes = ([16],) + for trial in self.trials_: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 100000, (None, None)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) + threads[0].start() + time.sleep(0.2) + threads[1].start() + threads[2].start() + + for t in threads: + t.join() + + # Expect only one error for rejection + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue( + "Request timeout expired" in ex.message(), + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Check that timeout larger than 'default_timeout_microseconds' will not + # override, the last two requests will be processed only after + # 'default_timeout_microseconds' and before queue delay. 
+ threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 10000000, (None, None)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) + threads[0].start() + time.sleep(0.2) + threads[1].start() + threads[2].start() + + for t in threads: + t.join() + + # Expect only one error for rejection + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue( + "Request timeout expired" in ex.message(), + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Sanity check that without override, the last two requests will be + # processed only after 'default_timeout_microseconds' + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) + threads[0].start() + time.sleep(0.2) + threads[1].start() + threads[2].start() + + for t in threads: + t.join() + + # Expect only one error for rejection + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue( + "Request timeout expired" in ex.message(), + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_priority_levels(self): + # Send 2 requests with batch sizes 2, 1 in default priority. Then send + # 1 request with batch size 2 in priority 1. Expect the third request is + # place in the front of the queue and form a preferred batch with the + # first request. + dtype = np.float32 + shapes = ([16],) + for trial in self.trials_: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (500, 200)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (15000, 10000)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (100, 0)), + kwargs=trial, + ) + ) + threads[0].start() + # wait to make sure the order is correct + time.sleep(0.1) + threads[1].start() + time.sleep(0.2) + threads[2].start() + + for t in threads: + t.join() + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_max_priority_levels(self): + # Send 2 requests with batch sizes 2, 1 in default priority (MAX_UINT32+1). Then send + # 1 request with batch size 2 in priority 1. Expect the third request is + # place in the front of the queue and form a preferred batch with the + # first request. 
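As a quick arithmetic check on the magic numbers used in this test and in the matching test.sh setup later in this diff (MAX_UINT64 and MAX_UINT32_PLUS_1): the default priority here is one past the largest unsigned 32-bit value, and the priority_levels value written by test.sh is the largest unsigned 64-bit value. A minimal, purely illustrative sanity check:

# Constants exactly as they appear in this test and in test.sh.
MAX_UINT32_PLUS_1 = 4294967296
MAX_UINT64 = 18446744073709551615

assert MAX_UINT32_PLUS_1 == 2**32  # one past the unsigned 32-bit range
assert MAX_UINT64 == 2**64 - 1     # the value passed as priority_levels in test.sh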
+ dtype = np.float32 + shapes = ([16],) + MAX_UINT32_PLUS_1 = 4294967296 + for trial in self.trials_: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (500, 200)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, MAX_UINT32_PLUS_1, 0, (15000, 10000)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (100, 0)), + kwargs=trial, + ) + ) + threads[0].start() + # wait to make sure the order is correct + time.sleep(0.1) + threads[1].start() + time.sleep(0.2) + threads[2].start() + + for t in threads: + t.join() + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_priority_with_policy(self): + # Two set of requests are being sent at different priority levels + # in sequence: + # priority 1: + # batch size 2, default timeout + # batch size 1, short timeout + # batch size 2, default timeout + # priority 2: + # batch size 2, medium timeout + # batch size 3, default timeout + # batch size 6, default timeout + # Expecting that by the time when the last request, second request in + # priority 2, is sent, the requests with short timeout will be handled + # accordingly, and the queue becomes: + # priority 1: + # batch size 2, default timeout (1st batch) + # batch size 2, default timeout (1st batch) + # batch size 1, short timeout (delayed, will be 2nd batch) + # priority 2: + # batch size 2, medium timeout (will be rejected) + # batch size 3, default timeout (will be 2nd batch) + # batch size 6, default timeout (will be 3rd batch) + + dtype = np.float32 + shapes = ([16],) + for trial in self.trials_: + threads = [] + # The expected ranges may not be rounded to accommodate + # the sleep between sending requests + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (2000, 1000)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 1, 1000000, (3400, 2400)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (1700, 700)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 2, 2000000, (None, None)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(3, dtype, shapes, 2, 0, (2700, 1700)), + kwargs=trial, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(6, dtype, shapes, 2, 0, (15000, 10000)), + kwargs=trial, + ) + ) + for t in threads: + t.start() + time.sleep(0.2) + + for t in threads: + t.join() + + # Expect only one error for rejection + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue( + "Request timeout expired" in ex.message(), + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) + + try: + self.check_deferred_exception() + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_model_queue/test.sh b/qa/L0_model_queue/test.sh new file mode 100755 index 0000000000..577b7b7fc2 --- /dev/null +++ b/qa/L0_model_queue/test.sh @@ -0,0 +1,394 @@ +#!/bin/bash +# Copyright 2020-2023, 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +CLIENT_LOG="./client.log" +MODEL_QUEUE_TEST=model_queue_test.py + +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver + +SERVER_ARGS="--model-repository=`pwd`/models" + +source ../common/util.sh + +RET=0 + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. +export CUDA_VISIBLE_DEVICES=0 + +# Prepare base model. Only test with custom backend as it is sufficient +rm -fr *.log models custom_zero_1_float32 +cp -r ../custom_models/custom_zero_1_float32 . && \ + mkdir -p ./custom_zero_1_float32/1 && \ + mkdir -p ./ensemble_zero_1_float32/1 + +(cd custom_zero_1_float32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + sed -i "s/max_batch_size:.*/max_batch_size: 32/g" config.pbtxt && \ + echo "instance_group [ { kind: KIND_CPU count: 1 }]" >> config.pbtxt) + +# test_max_queue_size +# For testing max queue size, we use delay in the custom model to +# create backlogs, "TRITONSERVER_DELAY_SCHEDULER" is not desired as queue size +# is capped by max queue size. +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. 
&& \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8 ]" >> config.pbtxt && \ + echo " default_queue_policy {" >> config.pbtxt && \ + echo " max_queue_size: 8" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +TEST_CASE=test_max_queue_size +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# test_policy_delay +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo " default_queue_policy {" >> config.pbtxt && \ + echo " timeout_action: DELAY" >> config.pbtxt && \ + echo " default_timeout_microseconds: 100000" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +TEST_CASE=test_policy_delay +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# test_policy_reject +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo " default_queue_policy {" >> config.pbtxt && \ + echo " default_timeout_microseconds: 100000" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +TEST_CASE=test_policy_reject +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# test_timeout_override +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo " default_queue_policy {" >> config.pbtxt && \ + echo " allow_timeout_override: true" >> config.pbtxt && \ + echo " default_timeout_microseconds: 1000000" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +TEST_CASE=test_timeout_override +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# test_priority_levels +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo " priority_levels: 2" >> config.pbtxt && \ + echo " default_priority_level: 2" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +TEST_CASE=test_priority_levels +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +MAX_UINT64=18446744073709551615 +MAX_UINT32_PLUS_1=4294967296 + +# test_max_priority_levels +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo " priority_levels: $MAX_UINT64" >> config.pbtxt && \ + echo " default_priority_level: $MAX_UINT32_PLUS_1" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +TEST_CASE=test_max_priority_levels +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# test_priority_with_policy +# 2 levels and 2 policies: +# priority 1: delay +# priority 2: reject +rm -fr models && mkdir models && \ + cp -r ensemble_zero_1_float32 models/. && \ + cp -r custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 4, 8, 32 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo " priority_levels: 2" >> config.pbtxt && \ + echo " default_priority_level: 2" >> config.pbtxt && \ + echo " default_queue_policy {" >> config.pbtxt && \ + echo " timeout_action: DELAY" >> config.pbtxt && \ + echo " allow_timeout_override: true" >> config.pbtxt && \ + echo " default_timeout_microseconds: 11000000" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " priority_queue_policy {" >> config.pbtxt && \ + echo " key: 2" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " timeout_action: REJECT" >> config.pbtxt && \ + echo " allow_timeout_override: true" >> config.pbtxt && \ + echo " default_timeout_microseconds: 11000000" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +TEST_CASE=test_priority_with_policy +SERVER_LOG="./$TEST_CASE.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_CASE" >>$CLIENT_LOG + +set +e +python $MODEL_QUEUE_TEST ModelQueueTest.$TEST_CASE >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_model_update/instance_update_test.py b/qa/L0_model_update/instance_update_test.py new file mode 100755 index 0000000000..1423632868 --- /dev/null +++ b/qa/L0_model_update/instance_update_test.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import concurrent.futures +import json +import os +import random +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient +from models.model_init_del.util import ( + disable_batching, + enable_batching, + get_count, + reset_count, + set_delay, + update_instance_group, + update_model_file, + update_sequence_batching, +) +from tritonclient.utils import InferenceServerException + + +class TestInstanceUpdate(unittest.TestCase): + _model_name = "model_init_del" + + def setUp(self): + # Reset counters + reset_count("initialize") + reset_count("finalize") + # Reset batching + disable_batching() + # Reset delays + set_delay("initialize", 0) + set_delay("infer", 0) + # Reset sequence batching + update_sequence_batching("") + # Initialize client + self._triton = grpcclient.InferenceServerClient("localhost:8001") + + def tearDown(self): + # Check if the test passed for this test case that is tearing down + r = self.defaultTestResult() + self._feedErrorsToResult(r, self._outcome.errors) + # Use `r = self._outcome.result` for the above, if Python >= 3.11 + passed = all(self != test_case for test_case, _ in r.errors + r.failures) + if passed: + # Do nothing if passed + return + # Best effort to reset the model state for the next test case + self._triton.unload_model(self._model_name) + time.sleep(30) # time for instances to finish unloading + + def _get_inputs(self, batching=False): + self.assertIsInstance(batching, bool) + if batching: + shape = [random.randint(1, 2), random.randint(1, 16)] + else: + shape = [random.randint(1, 16)] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + inputs[0].set_data_from_numpy(np.ones(shape, dtype=np.float32)) + return inputs + + def _infer(self, batching=False): + self._triton.infer(self._model_name, self._get_inputs(batching)) + + def _concurrent_infer(self, concurrency=4, batching=False): + pool = concurrent.futures.ThreadPoolExecutor() + stop = [False] + + def repeat_infer(): + while not stop[0]: + self._infer(batching) + + infer_threads = [pool.submit(repeat_infer) for i in range(concurrency)] + + def stop_infer(): + stop[0] = True + [t.result() for t in infer_threads] + pool.shutdown() + + return stop_infer + + def _check_count(self, kind, expected_count, poll=False): + self.assertIsInstance(poll, bool) + if poll: + timeout = 30 # seconds + poll_interval = 0.1 # seconds + max_retry = timeout / poll_interval + num_retry = 0 + while num_retry < max_retry and get_count(kind) < expected_count: + time.sleep(poll_interval) + num_retry += 1 + self.assertEqual(get_count(kind), expected_count) + + def _load_model(self, instance_count, instance_config="", batching=False): + # Set batching + enable_batching() if batching else disable_batching() + # Load model + self._update_instance_count( + instance_count, 0, instance_config, batching=batching + ) + + def _update_instance_count( + self, + add_count, + del_count, + instance_config="", + wait_for_finalize=False, + batching=False, + ): + 
self.assertIsInstance(add_count, int) + self.assertGreaterEqual(add_count, 0) + self.assertIsInstance(del_count, int) + self.assertGreaterEqual(del_count, 0) + self.assertIsInstance(instance_config, str) + prev_initialize_count = get_count("initialize") + prev_finalize_count = get_count("finalize") + new_initialize_count = prev_initialize_count + add_count + new_finalize_count = prev_finalize_count + del_count + if len(instance_config) == 0: + prev_count = prev_initialize_count - prev_finalize_count + new_count = prev_count + add_count - del_count + instance_config = "{\ncount: " + str(new_count) + "\nkind: KIND_CPU\n}" + update_instance_group(instance_config) + self._triton.load_model(self._model_name) + self._check_count("initialize", new_initialize_count) + self._check_count("finalize", new_finalize_count, wait_for_finalize) + self._infer(batching) + + def _unload_model(self, batching=False): + prev_initialize_count = get_count("initialize") + self._triton.unload_model(self._model_name) + self._check_count("initialize", prev_initialize_count) + self._check_count("finalize", prev_initialize_count, True) + with self.assertRaises(InferenceServerException): + self._infer(batching) + + # Test add -> remove -> add an instance without batching + def test_add_rm_add_instance_no_batching(self): + self._load_model(3, batching=False) + stop = self._concurrent_infer(batching=False) + self._update_instance_count(1, 0, batching=False) # add + self._update_instance_count(0, 1, batching=False) # remove + self._update_instance_count(1, 0, batching=False) # add + stop() + self._unload_model(batching=False) + + # Test add -> remove -> add an instance with batching + def test_add_rm_add_instance_with_batching(self): + self._load_model(4, batching=True) + stop = self._concurrent_infer(batching=True) + self._update_instance_count(1, 0, batching=True) # add + self._update_instance_count(0, 1, batching=True) # remove + self._update_instance_count(1, 0, batching=True) # add + stop() + self._unload_model(batching=True) + + # Test remove -> add -> remove an instance without batching + def test_rm_add_rm_instance_no_batching(self): + self._load_model(2, batching=False) + stop = self._concurrent_infer(batching=False) + self._update_instance_count(0, 1, batching=False) # remove + self._update_instance_count(1, 0, batching=False) # add + self._update_instance_count(0, 1, batching=False) # remove + stop() + self._unload_model(batching=False) + + # Test remove -> add -> remove an instance with batching + def test_rm_add_rm_instance_with_batching(self): + self._load_model(3, batching=True) + stop = self._concurrent_infer(batching=True) + self._update_instance_count(0, 1, batching=True) # remove + self._update_instance_count(1, 0, batching=True) # add + self._update_instance_count(0, 1, batching=True) # remove + stop() + self._unload_model(batching=True) + + # Test reduce instance count to zero + def test_rm_instance_to_zero(self): + self._load_model(1) + # Setting instance group count to 0 will be overwritten to 1, so no + # instances should be created or removed. 
+ self._update_instance_count(0, 0, "{\ncount: 0\nkind: KIND_CPU\n}") + self._unload_model() + + # Test add/remove multiple CPU instances at a time + def test_cpu_instance_update(self): + self._load_model(8) + self._update_instance_count(0, 4) # remove 4 instances + self._update_instance_count(0, 3) # remove 3 instances + self._update_instance_count(0, 0) # no change + time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update + self._update_instance_count(2, 0) # add 2 instances + self._update_instance_count(5, 0) # add 5 instances + self._unload_model() + + # Test add/remove multiple GPU instances at a time + def test_gpu_instance_update(self): + self._load_model(6, "{\ncount: 6\nkind: KIND_GPU\n}") + self._update_instance_count(0, 2, "{\ncount: 4\nkind: KIND_GPU\n}") + self._update_instance_count(3, 0, "{\ncount: 7\nkind: KIND_GPU\n}") + self._unload_model() + + # Test add/remove multiple CPU/GPU instances at a time + def test_gpu_cpu_instance_update(self): + # Load model with 1 GPU instance and 2 CPU instance + self._load_model( + 3, "{\ncount: 2\nkind: KIND_CPU\n},\n{\ncount: 1\nkind: KIND_GPU\n}" + ) + # Add 2 GPU instance and remove 1 CPU instance + self._update_instance_count( + 2, 1, "{\ncount: 1\nkind: KIND_CPU\n},\n{\ncount: 3\nkind: KIND_GPU\n}" + ) + # Shuffle the instances + self._update_instance_count( + 0, 0, "{\ncount: 3\nkind: KIND_GPU\n},\n{\ncount: 1\nkind: KIND_CPU\n}" + ) + time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update + # Remove 1 GPU instance and add 1 CPU instance + self._update_instance_count( + 1, 1, "{\ncount: 2\nkind: KIND_GPU\n},\n{\ncount: 2\nkind: KIND_CPU\n}" + ) + # Unload model + self._unload_model() + + # Test model instance name update + def test_instance_name_update(self): + # Load 3 instances with 2 different names + self._load_model( + 3, + '{\nname: "old_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "old_2"\ncount: 2\nkind: KIND_GPU\n}', + ) + # Change the instance names + self._update_instance_count( + 0, + 0, + '{\nname: "new_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "new_2"\ncount: 2\nkind: KIND_GPU\n}', + ) + # Unload model + self._unload_model() + + # Test instance signature grouping + def test_instance_signature(self): + # Load 2 GPU instances and 3 CPU instances + self._load_model( + 5, + '{\nname: "GPU_group"\ncount: 2\nkind: KIND_GPU\n},\n{\nname: "CPU_group"\ncount: 3\nkind: KIND_CPU\n}', + ) + # Flatten the instances representation + self._update_instance_count( + 0, + 0, + '{\nname: "CPU_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_2_3"\ncount: 2\nkind: KIND_CPU\n},\n{\nname: "GPU_1"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "GPU_2"\ncount: 1\nkind: KIND_GPU\n}', + ) + time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update + # Consolidate different representations + self._update_instance_count( + 0, + 0, + '{\nname: "CPU_group"\ncount: 3\nkind: KIND_CPU\n},\n{\nname: "GPU_group"\ncount: 2\nkind: KIND_GPU\n}', + ) + time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update + # Flatten the instances representation + self._update_instance_count( + 0, + 0, + '{\nname: "GPU_1"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "GPU_2"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "CPU_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_2"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_3"\ncount: 1\nkind: KIND_CPU\n}', + ) + # Unload model + self._unload_model() + + # Test instance update with invalid instance group config + def test_invalid_config(self): + # Load model with 8 instances + 
self._load_model(8) + # Set invalid config + update_instance_group("--- invalid config ---") + with self.assertRaises(InferenceServerException): + self._triton.load_model("model_init_del") + # Correct config by reducing instances to 4 + self._update_instance_count(0, 4) + # Unload model + self._unload_model() + + # Test instance update with model file changed + def test_model_file_update(self): + self._load_model(5) + update_model_file() + self._update_instance_count( + 6, 5, "{\ncount: 6\nkind: KIND_CPU\n}", wait_for_finalize=True + ) + self._unload_model() + + # Test instance update with non instance config changed in config.pbtxt + def test_non_instance_config_update(self): + self._load_model(4, batching=False) + enable_batching() + self._update_instance_count( + 2, + 4, + "{\ncount: 2\nkind: KIND_CPU\n}", + wait_for_finalize=True, + batching=True, + ) + self._unload_model(batching=True) + + # Test passing new instance config via load API + def test_load_api_with_config(self): + # Load model with 1 instance + self._load_model(1) + # Get the model config from Triton + config = self._triton.get_model_config(self._model_name, as_json=True) + self.assertIn("config", config) + self.assertIsInstance(config["config"], dict) + config = config["config"] + self.assertIn("instance_group", config) + self.assertIsInstance(config["instance_group"], list) + self.assertEqual(len(config["instance_group"]), 1) + self.assertIn("count", config["instance_group"][0]) + self.assertIsInstance(config["instance_group"][0]["count"], int) + # Add an extra instance into the model config + config["instance_group"][0]["count"] += 1 + self.assertEqual(config["instance_group"][0]["count"], 2) + # Load the extra instance via the load API + self._triton.load_model(self._model_name, config=json.dumps(config)) + self._check_count("initialize", 2) # 2 instances in total + self._check_count("finalize", 0) # no instance is removed + self._infer() + # Unload model + self._unload_model() + + # Test instance update with an ongoing inference + def test_update_while_inferencing(self): + # Load model with 1 instance + self._load_model(1) + # Add 1 instance while inferencing + set_delay("infer", 10) + update_instance_group("{\ncount: 2\nkind: KIND_CPU\n}") + with concurrent.futures.ThreadPoolExecutor() as pool: + infer_start_time = time.time() + infer_thread = pool.submit(self._infer) + time.sleep(2) # make sure inference has started + update_start_time = time.time() + update_thread = pool.submit(self._triton.load_model, self._model_name) + update_thread.result() + update_end_time = time.time() + infer_thread.result() + infer_end_time = time.time() + infer_time = infer_end_time - infer_start_time + update_time = update_end_time - update_start_time + # Adding a new instance does not depend on existing instances, so the + # ongoing inference should not block the update. 
+ self.assertGreaterEqual(infer_time, 10.0, "Invalid infer time") + self.assertLess(update_time, 5.0, "Update blocked by infer") + self._check_count("initialize", 2) + self._check_count("finalize", 0) + self._infer() + # Unload model + self._unload_model() + + # Test inference with an ongoing instance update + def test_infer_while_updating(self): + # Load model with 1 instance + self._load_model(1) + # Infer while adding 1 instance + set_delay("initialize", 10) + update_instance_group("{\ncount: 2\nkind: KIND_CPU\n}") + with concurrent.futures.ThreadPoolExecutor() as pool: + update_start_time = time.time() + update_thread = pool.submit(self._triton.load_model, self._model_name) + time.sleep(2) # make sure update has started + infer_start_time = time.time() + infer_thread = pool.submit(self._infer) + infer_thread.result() + infer_end_time = time.time() + update_thread.result() + update_end_time = time.time() + update_time = update_end_time - update_start_time + infer_time = infer_end_time - infer_start_time + # Waiting on new instance creation should not block inference on + # existing instances. + self.assertGreaterEqual(update_time, 10.0, "Invalid update time") + self.assertLess(infer_time, 5.0, "Infer blocked by update") + self._check_count("initialize", 2) + self._check_count("finalize", 0) + self._infer() + # Unload model + self._unload_model() + + # Test instance resource requirement increase + @unittest.skipUnless( + "execution_count" in os.environ["RATE_LIMIT_MODE"], + "Rate limiter precondition not met for this test", + ) + def test_instance_resource_increase(self): + # Load model + self._load_model( + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 2\n}\n]\n}\n}', + ) + # Increase resource requirement + self._update_instance_count( + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 8\n}\n]\n}\n}', + ) + # Check the model is not blocked from infer due to the default resource + # possibly not updated to the larger resource requirement. 
+ infer_count = 8 + infer_complete = [False for i in range(infer_count)] + + def infer(): + for i in range(infer_count): + self._infer() + infer_complete[i] = True + + with concurrent.futures.ThreadPoolExecutor() as pool: + infer_thread = pool.submit(infer) + time.sleep(infer_count / 2) # each infer should take < 0.5 seconds + self.assertNotIn(False, infer_complete, "Infer possibly stuck") + infer_thread.result() + # Unload model + self._unload_model() + + # Test instance resource requirement increase above explicit resource + @unittest.skipUnless( + os.environ["RATE_LIMIT_MODE"] == "execution_count_with_explicit_resource", + "Rate limiter precondition not met for this test", + ) + def test_instance_resource_increase_above_explicit(self): + # Load model + self._load_model( + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 2\n}\n]\n}\n}', + ) + # Increase resource requirement + with self.assertRaises(InferenceServerException): + self._update_instance_count( + 0, + 0, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 32\n}\n]\n}\n}', + ) + # Correct the resource requirement to match the explicit resource + self._update_instance_count( + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 10\n}\n]\n}\n}', + ) + # Unload model + self._unload_model() + + # Test instance resource requirement decrease + @unittest.skipUnless( + "execution_count" in os.environ["RATE_LIMIT_MODE"], + "Rate limiter precondition not met for this test", + ) + def test_instance_resource_decrease(self): + # Load model + self._load_model( + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 4\n}\n]\n}\n}', + ) + # Decrease resource requirement + self._update_instance_count( + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 3\n}\n]\n}\n}', + ) + # Unload model + self._unload_model() + # The resource count of 3 is unique across this entire test, so check + # the server output to make sure it is printed, which ensures the + # max resource is actually decreased. + time.sleep(1) # make sure the log file is updated + log_path = os.path.join( + os.environ["MODEL_LOG_DIR"], + "instance_update_test.rate_limit_" + + os.environ["RATE_LIMIT_MODE"] + + ".server.log", + ) + with open(log_path, mode="r", encoding="utf-8", errors="strict") as f: + if os.environ["RATE_LIMIT_MODE"] == "execution_count": + # Make sure the previous max resource limit of 4 is reduced to 3 + # when no explicit limit is set. + self.assertIn("Resource: R1\\t Count: 3", f.read()) + else: + # Make sure the max resource limit is never set to 3 when + # explicit limit of 10 is set. 
+ self.assertNotIn("Resource: R1\\t Count: 3", f.read()) + + _direct_sequence_batching_str = ( + "direct { }\nmax_sequence_idle_microseconds: 8000000" + ) + _oldest_sequence_batching_str = ( + "oldest { max_candidate_sequences: 4 }\nmax_sequence_idle_microseconds: 8000000" + ) + + # Test instance update for direct scheduler without any ongoing sequences + def test_direct_scheduler_update_no_ongoing_sequences(self): + self._test_scheduler_update_no_ongoing_sequences( + self._direct_sequence_batching_str + ) + + # Test instance update for direct scheduler with any ongoing sequences + def test_direct_scheduler_update_with_ongoing_sequences(self): + self._test_scheduler_update_with_ongoing_sequences( + self._direct_sequence_batching_str + ) + + # Test instance update for oldest scheduler without ongoing sequences + def test_oldest_scheduler_update_no_ongoing_sequences(self): + self._test_scheduler_update_no_ongoing_sequences( + self._oldest_sequence_batching_str + ) + + # Test instance update for oldest scheduler with ongoing sequences + def test_oldest_scheduler_update_with_ongoing_sequences(self): + self._test_scheduler_update_with_ongoing_sequences( + self._oldest_sequence_batching_str + ) + + # Helper function for testing the success of sequence instance updates + # without any ongoing sequences. + def _test_scheduler_update_no_ongoing_sequences(self, sequence_batching_str): + # Load model + update_instance_group("{\ncount: 2\nkind: KIND_CPU\n}") + update_sequence_batching(sequence_batching_str) + self._triton.load_model(self._model_name) + self._check_count("initialize", 2) + self._check_count("finalize", 0) + # Basic sequence inference + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_start=True + ) + self._triton.infer(self._model_name, self._get_inputs(), sequence_id=1) + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_end=True + ) + # Add 2 instances without in-flight sequence + update_instance_group("{\ncount: 4\nkind: KIND_CPU\n}") + self._triton.load_model(self._model_name) + self._check_count("initialize", 4) + self._check_count("finalize", 0) + # Basic sequence inference + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_start=True + ) + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_end=True + ) + # Remove 1 instance without in-flight sequence + update_instance_group("{\ncount: 3\nkind: KIND_CPU\n}") + self._triton.load_model(self._model_name) + self._check_count("initialize", 4) + self._check_count("finalize", 1, poll=True) + # Basic sequence inference + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_start=True + ) + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_end=True + ) + # Unload model + self._triton.unload_model(self._model_name) + self._check_count("initialize", 4) + self._check_count("finalize", 4, poll=True) + + # Helper function for testing if ongoing sequences may continue to infer on + # the same instance after the instance processing the sequence is removed + # from an instance update, which the removed instance will live until the + # sequences end. 
+ def _test_scheduler_update_with_ongoing_sequences(self, sequence_batching_str): + # Load model + update_instance_group("{\ncount: 3\nkind: KIND_CPU\n}") + update_sequence_batching(sequence_batching_str) + self._triton.load_model(self._model_name) + self._check_count("initialize", 3) + self._check_count("finalize", 0) + # Start sequence 1 and 2 on CPU instances + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_start=True + ) + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=2, sequence_start=True + ) + # Remove all 3 CPU and add 1 GPU instance with in-flight sequences. Both + # in-flight sequences are assigned to any 2 CPU instances, so exactly 1 + # CPU instance can be removed immediately. + update_instance_group("{\ncount: 1\nkind: KIND_GPU\n}") + self._triton.load_model(self._model_name) + self._check_count("initialize", 4) # 3 CPU + 1 GPU + self._check_count("finalize", 1, poll=True) # 1 CPU + # Sequence 1 and 2 may continue to infer + self._triton.infer(self._model_name, self._get_inputs(), sequence_id=1) + self._triton.infer(self._model_name, self._get_inputs(), sequence_id=2) + self._check_count("finalize", 1) # check 2 CPU instances not removed + # Start sequence 3 on GPU instance + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=3, sequence_start=True + ) + self._check_count("finalize", 1) # check 2 CPU instances not removed + # End sequence 1 and 2 will remove the 2 CPU instances + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=1, sequence_end=True + ) + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=2, sequence_end=True + ) + self._check_count("finalize", 3, poll=True) # 3 CPU + # End sequence 3 + self._triton.infer( + self._model_name, self._get_inputs(), sequence_id=3, sequence_end=True + ) + # Unload model + self._triton.unload_model(self._model_name) + self._check_count("initialize", 4) # 3 CPU + 1 GPU + self._check_count("finalize", 4, poll=True) # 3 CPU + 1 GPU + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_model_update/test.sh b/qa/L0_model_update/test.sh new file mode 100755 index 0000000000..aa9cf7fcc1 --- /dev/null +++ b/qa/L0_model_update/test.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# This L0_model_update test should make changes to models without restarting the +# server, unless restarting the server is the only way of accomplishing the +# change. + +export CUDA_VISIBLE_DEVICES=0 +export PYTHONDONTWRITEBYTECODE="True" +export MODEL_LOG_DIR="`pwd`" + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +function setup_models() { + rm -rf models && mkdir models + # Basic model that log instance creation and destruction + cp -r ../python_models/model_init_del models/model_init_del && \ + mkdir models/model_init_del/1 && \ + mv models/model_init_del/model.py models/model_init_del/1 +} + +RET=0 + +# Test model instance update with rate limiting on/off and explicit resource +for RATE_LIMIT_MODE in "off" "execution_count" "execution_count_with_explicit_resource"; do + + RATE_LIMIT_ARGS="--rate-limit=$RATE_LIMIT_MODE" + if [ "$RATE_LIMIT_MODE" == "execution_count_with_explicit_resource" ]; then + RATE_LIMIT_ARGS="--rate-limit=execution_count --rate-limit-resource=R1:10" + fi + + export RATE_LIMIT_MODE=$RATE_LIMIT_MODE + TEST_LOG="instance_update_test.rate_limit_$RATE_LIMIT_MODE.log" + SERVER_LOG="./instance_update_test.rate_limit_$RATE_LIMIT_MODE.server.log" + + setup_models + SERVER_ARGS="--model-repository=models --model-control-mode=explicit $RATE_LIMIT_ARGS --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + python instance_update_test.py > $TEST_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed model instance update test on rate limit mode $RATE_LIMIT_MODE\n***" + cat $TEST_LOG + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + set +e + grep "Should not print this" $SERVER_LOG + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Found \"Should not print this\" on \"$SERVER_LOG\"\n***" + cat $SERVER_LOG + RET=1 + fi + set -e + +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_multi_server/test.sh b/qa/L0_multi_server/test.sh new file mode 100755 index 0000000000..cd5ff3d407 --- /dev/null +++ b/qa/L0_multi_server/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +MODELSDIR=`pwd`/models +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository + +export CUDA_VISIBLE_DEVICES=0 + +# Must explicitly set LD_LIBRARY_PATH so that server can find +# libtritonserver.so. +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH + +rm -f *.log && rm -rf ${MODELSDIR}* + +RET=0 + +MULTI_SERVER=multi_server +CLIENT_LOG=$MULTI_SERVER +MULTI_SERVER=./$MULTI_SERVER +BACKENDS=(graphdef onnx plan) +THREAD_COUNT=32 +LOOPS=32 + +EXTRA_ARGS=" -t ${THREAD_COUNT} -l ${LOOPS}" +for (( I=1; I<${THREAD_COUNT}+2; I++ )); do + BACKEND_INDEX=$(((I % 3) - 1)) + full=${BACKENDS[$BACKEND_INDEX]}_float32_float32_float32 + mkdir -p ${MODELSDIR}${I}/simple${I}/1 && \ + cp -r $DATADIR/${full}/1/* ${MODELSDIR}${I}/simple${I}/1/. && \ + cp $DATADIR/${full}/config.pbtxt ${MODELSDIR}${I}/simple${I}/. && \ + (cd ${MODELSDIR}${I}/simple${I} && \ + sed -i "s/^name:.*/name: \"simple${I}\"/" config.pbtxt && \ + sed -i "s/label_filename:.*//" config.pbtxt) + EXTRA_ARGS="${EXTRA_ARGS} -r ${MODELSDIR}${I}" +done + +set +e + +# No memory type enforcement +$MULTI_SERVER ${EXTRA_ARGS} >>$CLIENT_LOG.log 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_nan_inf/models/nan_inf_output/1/model.py b/qa/L0_nan_inf/models/nan_inf_output/1/model.py new file mode 100644 index 0000000000..17cfb04fa0 --- /dev/null +++ b/qa/L0_nan_inf/models/nan_inf_output/1/model.py @@ -0,0 +1,47 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_nan_inf/models/nan_inf_output/config.pbtxt b/qa/L0_nan_inf/models/nan_inf_output/config.pbtxt new file mode 100644 index 0000000000..75071bbad0 --- /dev/null +++ b/qa/L0_nan_inf/models/nan_inf_output/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "nan_inf_output" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 6 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/L0_nan_inf/nan_inf_test.py b/qa/L0_nan_inf/nan_inf_test.py new file mode 100755 index 0000000000..3013b03850 --- /dev/null +++ b/qa/L0_nan_inf/nan_inf_test.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
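+
+# The "nan_inf_output" model used below returns [nan, inf, -inf, 1, 2, 3], so
+# the comparisons in this file must treat NaN as equal to NaN. A minimal sketch
+# of the NumPy comparison the tests rely on (equal_nan requires NumPy >= 1.19):
+#
+#   a = np.array([np.nan, np.inf, -np.inf], dtype=np.float32)
+#   np.array_equal(a, a)                  # False: NaN != NaN by default
+#   np.array_equal(a, a, equal_nan=True)  # True: NaN positions compare equal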
+ +import sys + +sys.path.append("../common") + +import json +import traceback +import unittest + +import numpy as np +import requests +import test_util as tu +import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient +from tritonclient.utils import InferenceServerException + + +class NanInfTest(tu.TestResultCollector): + expected_output = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], dtype=np.float32) + model_name = "nan_inf_output" + + def test_http_raw(self): + payload = { + "inputs": [ + {"name": "INPUT0", "datatype": "FP32", "shape": [1], "data": [1]} + ] + } + response = requests.post( + "http://localhost:8000/v2/models/nan_inf_output/infer", + data=json.dumps(payload), + ) + if not response.ok: + self.assertTrue(False, "Response not OK: {}".format(response.text)) + + try: + print(response.json()) + except: + self.assertTrue( + False, "Response was not valid JSON:\n{}".format(response.text) + ) + + def test_http(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [1], "FP32")) + self.infer_helper(triton_client, inputs) + + def test_grpc(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + inputs.append(tritongrpcclient.InferInput("INPUT0", [1], "FP32")) + self.infer_helper(triton_client, inputs) + + def infer_helper(self, triton_client, inputs): + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.float32)) + + try: + results = triton_client.infer(model_name=self.model_name, inputs=inputs) + output0_data = results.as_numpy("OUTPUT0") + # Verify output is as expected + # Make sure nan's are equivalent when compared + output_correct = np.array_equal( + output0_data, self.expected_output, equal_nan=True + ) + self.assertTrue( + output_correct, "didn't get expected output0: {}".format(output0_data) + ) + except InferenceServerException as ex: + self.assertTrue(False, ex.message()) + except: + self.assertTrue(False, traceback.format_exc()) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_nan_inf/test.sh b/qa/L0_nan_inf/test.sh new file mode 100755 index 0000000000..0e778966a4 --- /dev/null +++ b/qa/L0_nan_inf/test.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +CLIENT_LOG="./nan_inf_client.log" +TEST_PY=./nan_inf_test.py +EXPECTED_NUM_TESTS="3" +TEST_RESULT_FILE='test_results.txt' + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 $TEST_PY >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_nullchar_string/nullchar_string_client.py b/qa/L0_nullchar_string/nullchar_string_client.py new file mode 100755 index 0000000000..1ab76bcf03 --- /dev/null +++ b/qa/L0_nullchar_string/nullchar_string_client.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-m", "--model-name", type=str, required=True, help="Name of model" + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) + + FLAGS = parser.parse_args() + + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + # Create the inference context for the model. + client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) + + # We use identity string models that takes 1 input tensor of a single string + # and returns 1 output tensor of a single string. The output tensor is the + # same as the input tensor. + batch_size = 1 + + # Create the data for the input tensor. It contains a null character in + # the middle of the string. + tmp_str = "abc\0def" + input0_data = np.array([tmp_str], dtype=object) + + # Send inference request to the inference server. Get results for + # output tensor. + inputs = [ + client_util.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(np.object_) + ) + ] + inputs[0].set_data_from_numpy(input0_data) + + results = client.infer(FLAGS.model_name, inputs) + + # We expect there to be 1 result (with batch-size 1). Compare the input + # and output tensor calculated by the model. They must be the same. + output0_data = results.as_numpy("OUTPUT0") + + print(input0_data, "?=?", output0_data) + assert np.equal(input0_data.astype(np.bytes_), output0_data).all() diff --git a/qa/L0_nullchar_string/test.sh b/qa/L0_nullchar_string/test.sh new file mode 100755 index 0000000000..bded41dc92 --- /dev/null +++ b/qa/L0_nullchar_string/test.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository +MODELS="graphdef_nobatch_zero_1_object savedmodel_nobatch_zero_1_object" +NULLCHAR_CLIENT_PY=nullchar_string_client.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $CLIENT_LOG $SERVER_LOG models + +mkdir -p models +for MODEL in $MODELS; do + cp -r $DATADIR/$MODEL models/. +done + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +set +e + +# Ignore ONNX backend because even though ONNX supports string data type, +# strings that contain null character in the middle is not allowed. +# https://github.com/microsoft/onnxruntime/issues/2284 +for MODEL in $MODELS; do + python $NULLCHAR_CLIENT_PY -m $MODEL -v >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + python $NULLCHAR_CLIENT_PY -m $MODEL -i grpc -u localhost:8001 -v >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_onnx_optimization/test.sh b/qa/L0_onnx_optimization/test.sh new file mode 100755 index 0000000000..3657d95ed1 --- /dev/null +++ b/qa/L0_onnx_optimization/test.sh @@ -0,0 +1,248 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +CLIENT_LOG="./client.log" +ONNXTRT_OPTIMIZATION_TEST=onnxtrt_optimization_test.py + +SERVER=/opt/tritonserver/bin/tritonserver +CACHE_PATH=`pwd`/trt_cache +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1 --exit-on-error=false" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +RET=0 + +for MODEL in \ + onnx_float32_float32_float32; do + rm -f ./*.log + rm -fr models && mkdir -p models + cp -r $DATADIR/qa_model_repository/${MODEL} \ + models/${MODEL}_test && \ + rm -fr models/${MODEL}_test/2 && \ + rm -fr models/${MODEL}_test/3 && \ + # Set instance count > 1 to test parallel instance loading across all EPs + INSTANCE_COUNT=5 + (cd models/${MODEL}_test && \ + sed -i 's/_float32_float32_float32/&_test/' config.pbtxt && \ + echo -e "\ninstance_group { count: ${INSTANCE_COUNT} }" >> config.pbtxt) && \ + # CUDA EP optimization params + cp -r models/${MODEL}_test models/${MODEL}_cuda_config && \ + (cd models/${MODEL}_cuda_config && \ + sed -i 's/_float32_test/_float32_cuda_config/' \ + config.pbtxt && \ + echo "parameters: { key: \"cudnn_conv_algo_search\" value: { string_value: \"1\" }} \ + parameters: { key: \"arena_extend_strategy\" value: { string_value: \"1\" }} + parameters: { key: \"gpu_mem_limit\" value: { string_value: \"18446744073709551614\" }} " \ >> config.pbtxt) && \ + # CUDA EP optimization params specified in gpu_execution_accelerator field + cp -r models/${MODEL}_test models/${MODEL}_cuda_param_field && \ + (cd models/${MODEL}_cuda_param_field && \ + sed -i 's/_float32_test/_float32_cuda_param_field/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"cuda\" \ + parameters { key: \"cudnn_conv_use_max_workspace\" value: \"0\" } \ + parameters { key: \"use_ep_level_unified_stream\" value: \"1\" } }]}}" \ + >> config.pbtxt) && \ + # CPU EP optimization params + cp -r models/${MODEL}_test models/${MODEL}_cpu_config && \ + (cd models/${MODEL}_cpu_config && \ + sed -i 's/_float32_test/_float32_cpu_config/' \ + config.pbtxt && \ + echo "parameters: { key: \"intra_op_thread_count\" value: { string_value: \"1\" }} \ + parameters: { key: 
\"enable_mem_arena\" value: { string_value: \"1\" }} + parameters: { key: \"enable_mem_pattern\" value: { string_value: \"1\" }} + parameters: { key: \"memory.enable_memory_arena_shrinkage\" value: { string_value: \"cpu:0\" }} " \ >> config.pbtxt) && \ + # GPU execution accelerators with default setting + cp -r models/${MODEL}_test models/${MODEL}_trt && \ + (cd models/${MODEL}_trt && \ + sed -i 's/_float32_test/_float32_trt/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\"} ] } }" >> config.pbtxt) && \ + # TRT execution accelerators with correct parameters + cp -r models/${MODEL}_test models/${MODEL}_trt_param && \ + (cd models/${MODEL}_trt_param && \ + sed -i 's/_float32_test/_float32_trt_param/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"precision_mode\" value: \"FP16\" } \ + parameters { key: \"trt_max_partition_iterations\" value: \"1000\" } \ + parameters { key: \"trt_dump_subgraphs\" value: \"1\" } \ + parameters { key: \"trt_timing_cache_enable\" value: \"1\" } \ + parameters { key: \"trt_build_heuristics_enable\" value: \"1\" } \ + parameters { key: \"trt_cuda_graph_enable\" value: \"1\" } \ + parameters { key: \"max_workspace_size_bytes\" value: \"1073741824\" } }]}}" \ + >> config.pbtxt) && \ + # TRT execution accelerators with cache enabled + cp -r models/${MODEL}_test models/${MODEL}_trt_cache_on && \ + (cd models/${MODEL}_trt_cache_on && \ + sed -i 's/_float32_test/_float32_trt_cache_on/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"trt_engine_cache_enable\" value: \"1\" } \ + parameters { key: \"trt_max_partition_iterations\" value: \"1000\" } \ + parameters { key: \"trt_dump_subgraphs\" value: \"1\" } \ + parameters { key: \"trt_timing_cache_enable\" value: \"1\" } \ + parameters { key: \"trt_build_heuristics_enable\" value: \"1\" } \ + parameters { key: \"trt_cuda_graph_enable\" value: \"1\" } \ + parameters { key: \"trt_engine_cache_path\" value: \"${CACHE_PATH}\" } }]}}" \ + >> config.pbtxt) && \ + # TRT execution accelerators with unknown parameters + cp -r models/${MODEL}_test models/${MODEL}_trt_unknown_param && \ + (cd models/${MODEL}_trt_unknown_param && \ + sed -i 's/_float32_test/_float32_trt_unknown_param/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"precision_mode\" value: \"FP16\" } \ + parameters { key: \"segment_size\" value: \"1\" } }]}}" \ + >> config.pbtxt) && \ + # TRT execution accelerators with invalid parameters + cp -r models/${MODEL}_test models/${MODEL}_trt_invalid_param && \ + (cd models/${MODEL}_trt_invalid_param && \ + sed -i 's/_float32_test/_float32_trt_invalid_param/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"precision_mode\" value: \"FP16\" } \ + parameters { key: \"max_workspace_size_bytes\" value: \"abc\" } }]}}" \ + >> config.pbtxt) && \ + # Unknown GPU execution accelerator + cp -r models/${MODEL}_test models/${MODEL}_unknown_gpu && \ + (cd models/${MODEL}_unknown_gpu && \ + sed -i 's/_float32_test/_float32_unknown_gpu/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"unknown_gpu\" } ] } }" >> 
config.pbtxt) && \ + + run_server_tolive + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + grep "TensorRT Execution Accelerator is set for '${MODEL}_trt'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected TensorRT Execution Accelerator is set for '${MODEL}_trt'\n***" + RET=1 + fi + + grep "TensorRT Execution Accelerator is set for '${MODEL}_trt_param'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected TensorRT Execution Accelerator is set for '${MODEL}_trt_param'\n***" + RET=1 + fi + + grep "TensorRT Execution Accelerator is set for '${MODEL}_trt_cache_on'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected TensorRT Execution Accelerator is set for '${MODEL}_trt_cache_on'\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_trt_unknown_param' version 1: Invalid argument: unknown parameter 'segment_size' is provided for TensorRT Execution Accelerator" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected unknown parameter 'segment_size' returns error\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_trt_invalid_param' version 1: Invalid argument: failed to convert 'abc' to unsigned long long integral number" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected invalid parameter 'abc' returns error\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_unknown_gpu' version 1: Invalid argument: unknown Execution Accelerator 'unknown_gpu' is requested" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'unknown_gpu' Execution Accelerator returns error\n***" + RET=1 + fi + + grep "memory limit: 18446744073709551614 arena_extend_strategy: 1" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected configurations not set for '${MODEL}_cuda_config'\n***" + RET=1 + fi + + grep "CUDA Execution Accelerator is set for '${MODEL}_cpu_config'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected CUDA Execution Accelerator is set for '${MODEL}_cpu_config'\n***" + RET=1 + fi + + matched_line=$(grep "CUDA Execution Accelerator is set for 'onnx_float32_float32_float32_cuda_param_field'" $SERVER_LOG) + if [[ "$matched_line" != *"use_ep_level_unified_stream=1"* ]] || [[ "$matched_line" != *"cudnn_conv_use_max_workspace=0"* ]]; then + echo -e "\n***\n*** Failed. Expected CUDA Execution Accelerator options correctly set for '${MODEL}_cuda_param_field'\n***" + RET=1 + fi + + # arena configs + grep "Configuring enable_mem_arena to 1" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected Configuring enable_mem_arena to 1\n***" + RET=1 + fi + + grep "Configuring enable_mem_pattern to 1" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected Configuring enable_mem_pattern to 1\n***" + RET=1 + fi + + grep "Configuring memory.enable_memory_arena_shrinkage to cpu:0" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected Configuring memory.enable_memory_arena_shrinkage to cpu:0\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt new file mode 100644 index 0000000000..e0dfcd2b48 --- /dev/null +++ b/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt @@ -0,0 +1,79 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble_identity_2_float32" +platform: "ensemble" +max_batch_size: 4 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "identity_2_float32" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_optional_input/models/identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/identity_2_float32/config.pbtxt new file mode 100644 index 0000000000..37d15089e1 --- /dev/null +++ b/qa/L0_optional_input/models/identity_2_float32/config.pbtxt @@ -0,0 +1,56 @@ +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity_2_float32" +backend: "identity" +max_batch_size: 4 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +dynamic_batching { preferred_batch_size: [4], max_queue_delay_microseconds: 5000000 } diff --git a/qa/L0_optional_input/models/optional_connecting_tensor/config.pbtxt b/qa/L0_optional_input/models/optional_connecting_tensor/config.pbtxt new file mode 100644 index 0000000000..afc4ebc00f --- /dev/null +++ b/qa/L0_optional_input/models/optional_connecting_tensor/config.pbtxt @@ -0,0 +1,98 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
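+
+# This ensemble chains two steps of the Python "optional_identity" model. Each
+# step only produces the OUTPUT tensors whose INPUTs were provided, so the
+# connecting tensors (internal_output0/1) may be a subset of the graph below;
+# that is valid because the second step's inputs are also declared optional.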
+ +platform: "ensemble" +max_batch_size: 4 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "optional_identity" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "internal_output0" + } + output_map { + key: "OUTPUT1" + value: "internal_output1" + } + }, + { + model_name: "optional_identity" + model_version: -1 + input_map { + key: "INPUT0" + value: "internal_output0" + } + input_map { + key: "INPUT1" + value: "internal_output1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_optional_input/models/optional_identity/1/model.py b/qa/L0_optional_input/models/optional_identity/1/model.py new file mode 100644 index 0000000000..c736ecc3bd --- /dev/null +++ b/qa/L0_optional_input/models/optional_identity/1/model.py @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Identity model in Python backend. 
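+
+        Each INPUT<i> provided in a request is copied unchanged to the
+        corresponding OUTPUT<i>; both inputs are optional, so only the
+        tensors actually present in the request are echoed back.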
+ """ + responses = [] + for request in requests: + for tidx in ("0", "1"): + input_tensor = pb_utils.get_input_tensor_by_name( + request, "INPUT" + tidx + ) + if input_tensor is not None: + out_tensor = pb_utils.Tensor( + "OUTPUT" + tidx, input_tensor.as_numpy() + ) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/L0_optional_input/models/optional_identity/config.pbtxt b/qa/L0_optional_input/models/optional_identity/config.pbtxt new file mode 100644 index 0000000000..0c73fd7ca5 --- /dev/null +++ b/qa/L0_optional_input/models/optional_identity/config.pbtxt @@ -0,0 +1,53 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +backend: "python" +max_batch_size: 4 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt new file mode 100644 index 0000000000..58e867482d --- /dev/null +++ b/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt @@ -0,0 +1,91 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "pipeline_identity_2_float32" +platform: "ensemble" +max_batch_size: 4 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "identity_2_float32" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "internal_output" + } + }, + { + model_name: "identity_2_float32" + model_version: -1 + input_map { + key: "INPUT1" + value: "internal_output" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_optional_input/optional_input_test.py b/qa/L0_optional_input/optional_input_test.py new file mode 100755 index 0000000000..c1fd114d6b --- /dev/null +++ b/qa/L0_optional_input/optional_input_test.py @@ -0,0 +1,445 @@ +#!/usr/bin/python + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import sys +import threading +import time +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient + +_deferred_exceptions_lock = threading.Lock() +_deferred_exceptions = [] + + +# Similar set up as dynamic batcher tests +class OptionalInputTest(tu.TestResultCollector): + def setUp(self): + global _deferred_exceptions + _deferred_exceptions = [] + + # The helper client for setup will be GRPC for simplicity. + self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001") + self.model_name_ = "identity_2_float32" + # This will not be changed even when ensemble is under test, + # as the dynamic batching is performed within the composing model + self.check_status_model = "identity_2_float32" + self.tensor_shape_ = (1, 1) + self.inputs_ = { + "INPUT0": grpcclient.InferInput("INPUT0", [1, 1], "FP32"), + "INPUT1": grpcclient.InferInput("INPUT1", [1, 1], "FP32"), + } + self.input_data_ = { + "INPUT0": np.ones(shape=(1, 1), dtype=np.float32), + "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32), + } + self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"]) + self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"]) + self.outputs_ = { + "INPUT0": grpcclient.InferRequestedOutput("OUTPUT0"), + "INPUT1": grpcclient.InferRequestedOutput("OUTPUT1"), + } + + def add_deferred_exception(self, ex): + global _deferred_exceptions + with _deferred_exceptions_lock: + _deferred_exceptions.append(ex) + + def check_deferred_exception(self): + # Just raise one of the exceptions... 
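+        # Exceptions raised inside the worker threads (e.g. failed assertions
+        # in check_response) cannot fail the test directly; they are collected
+        # in _deferred_exceptions and the first one is re-raised here so the
+        # main thread reports the failure.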
+ with _deferred_exceptions_lock: + if len(_deferred_exceptions) > 0: + raise _deferred_exceptions[0] + + def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")): + try: + start_ms = int(round(time.time() * 1000)) + + inputs = [] + outputs = [] + for provided_input in provided_inputs: + inputs.append(self.inputs_[provided_input]) + outputs.append(self.outputs_[provided_input]) + + triton_client = grpcclient.InferenceServerClient("localhost:8001") + results = triton_client.infer( + model_name=self.model_name_, inputs=inputs, outputs=outputs + ) + + end_ms = int(round(time.time() * 1000)) + + for provided_input in provided_inputs: + output_name = self.outputs_[provided_input].name() + expected = self.input_data_[provided_input] + output_data = results.as_numpy(output_name) + self.assertTrue( + np.array_equal(output_data, expected), + "{}, {}, expected: {}, got {}".format( + self.model_name_, output_name, expected, output_data + ), + ) + + gt_ms = thresholds[0] + lt_ms = thresholds[1] + if lt_ms is not None: + self.assertTrue( + (end_ms - start_ms) < lt_ms, + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + if gt_ms is not None: + self.assertTrue( + (end_ms - start_ms) > gt_ms, + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + except Exception as ex: + self.add_deferred_exception(ex) + + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt): + # There is a time window between when responses are returned and statistics are updated. + # To prevent intermittent test failure during that window, wait up to 10 seconds for the + # inference statistics to be ready. + num_tries = 10 + for i in range(num_tries): + stats = self.triton_client_.get_inference_statistics(model_name, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + actual_exec_cnt = stats.model_stats[0].execution_count + if stats.model_stats[0].execution_count > 0: + break + time.sleep(1) + + self.assertEqual( + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) + + batch_stats = stats.model_stats[0].batch_stats + self.assertEqual( + len(batch_stats), + len(batch_exec), + "expected {} different batch-sizes, got {}".format( + len(batch_exec), len(batch_stats) + ), + ) + + for batch_stat in batch_stats: + bs = batch_stat.batch_size + bc = batch_stat.compute_infer.count + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) + # Get count from one of the stats + self.assertEqual( + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) + + actual_request_cnt = stats.model_stats[0].inference_stats.success.count + self.assertEqual( + actual_request_cnt, + request_cnt, + "expected model-request-count {}, got {}".format( + request_cnt, actual_request_cnt + ), + ) + + actual_exec_cnt = stats.model_stats[0].execution_count + self.assertEqual( + actual_request_cnt, + request_cnt, + "expected model-exec-count {}, got {}".format(request_cnt, actual_exec_cnt), + ) + + actual_infer_cnt = stats.model_stats[0].inference_count + self.assertEqual( + actual_infer_cnt, + infer_cnt, + "expected model-inference-count {}, got {}".format( + infer_cnt, actual_infer_cnt + ), + ) + + def test_all_inputs(self): + # 
Provide all inputs, send requests that don't form preferred batch + # so all requests should be returned after the queue delay + try: + threads = [] + threads.append( + threading.Thread(target=self.check_response, args=((4000, None),)) + ) + threads.append( + threading.Thread(target=self.check_response, args=((4000, None),)) + ) + threads[0].start() + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(self.check_status_model, {2: 1}, 2, 2) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_optional_same_input(self): + # Provide only one of the inputs, send requests that don't form + # preferred batch so all requests should be returned after + # the queue delay + try: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) + threads[0].start() + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(self.check_status_model, {2: 1}, 2, 2) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_optional_mix_inputs(self): + # Each request provides one of the inputs interleavingly, + # all requests except the last one should be returned in less + # than the queue delay because batcher should send the batch immediately + # when it sees the provided inputs are different + try: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) + + threads.append( + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) + for t in threads: + t.start() + time.sleep(0.5) + + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(self.check_status_model, {1: 4}, 4, 4) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_optional_mix_inputs_2(self): + # Each request provides one of the inputs or all inputs interleavingly, + # all requests except the last one should be returned in less + # than the queue delay because batcher should send the batch immediately + # when it sees the provided inputs are different + try: + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) + threads.append( + threading.Thread(target=self.check_response, args=((0, 4000),)) + ) + + threads.append( + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) + threads.append( + threading.Thread(target=self.check_response, args=((4000, None),)) + ) + for t in threads: + t.start() + time.sleep(0.5) + + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(self.check_status_model, {1: 4}, 4, 4) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def 
test_ensemble_all_inputs(self): + # The ensemble is only a wrapper over 'identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" + self.test_all_inputs() + # From the ensemble's perspective, the requests are processed as-is + self.check_status(self.model_name_, {1: 2}, 2, 2) + + def test_ensemble_optional_same_input(self): + # The ensemble is only a wrapper over 'identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" + self.test_optional_same_input() + # From the ensemble's perspective, the requests are processed as-is + self.check_status(self.model_name_, {1: 2}, 2, 2) + + def test_ensemble_optional_mix_inputs(self): + # The ensemble is only a wrapper over 'identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" + self.test_optional_mix_inputs() + # From the ensemble's perspective, the requests are processed as-is + self.check_status(self.model_name_, {1: 4}, 4, 4) + + def test_ensemble_optional_mix_inputs_2(self): + # The ensemble is only a wrapper over 'identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" + self.test_optional_mix_inputs_2() + # From the ensemble's perspective, the requests are processed as-is + self.check_status(self.model_name_, {1: 4}, 4, 4) + + def test_ensemble_optional_pipeline(self): + # The ensemble is a special case of pipelining models with optional + # inputs, where the ensemble step only connects a subset of inputs + # for the second model (which is valid because the disconnected inputs + # are marked optional). See 'config.pbtxt' for details. + self.model_name_ = "pipeline_identity_2_float32" + + # Provide all inputs, send requests that don't form preferred batch + # so all requests should be returned after the queue delay + try: + provided_inputs = ("INPUT0", "INPUT1") + inputs = [] + for provided_input in provided_inputs: + inputs.append(self.inputs_[provided_input]) + + triton_client = grpcclient.InferenceServerClient("localhost:8001") + results = triton_client.infer(model_name=self.model_name_, inputs=inputs) + + # OUTPUT0 is always zero, OUTPUT1 = INPUT0 + output_data = results.as_numpy("OUTPUT0") + expected = np.zeros(shape=(1, 1), dtype=np.float32) + self.assertTrue( + np.array_equal(output_data, expected), + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT0", expected, output_data + ), + ) + + expected = self.input_data_["INPUT0"] + output_data = results.as_numpy("OUTPUT1") + self.assertTrue( + np.array_equal(output_data, expected), + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT1", expected, output_data + ), + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_ensemble_optional_connecting_tensor(self): + # The ensemble is a special case of pipelining models with optional + # inputs, where the request will only produce a subset of inputs + # for the second model while the ensemble graph connects all inputs of + # the second model (which is valid because the not-provided inputs + # are marked optional). See 'config.pbtxt' for details.
+ self.model_name_ = "optional_connecting_tensor" + + # Provide all inputs, send requests that don't form preferred batch + # so all requests should be returned after the queue delay + try: + provided_inputs = ("INPUT0",) + inputs = [] + outputs = [] + for provided_input in provided_inputs: + inputs.append(self.inputs_[provided_input]) + outputs.append(self.outputs_[provided_input]) + + triton_client = grpcclient.InferenceServerClient("localhost:8001") + results = triton_client.infer( + model_name=self.model_name_, inputs=inputs, outputs=outputs + ) + + expected = self.input_data_["INPUT0"] + output_data = results.as_numpy("OUTPUT0") + self.assertTrue( + np.array_equal(output_data, expected), + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT0", expected, output_data + ), + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_optional_input/test.sh b/qa/L0_optional_input/test.sh new file mode 100755 index 0000000000..8bfd113d32 --- /dev/null +++ b/qa/L0_optional_input/test.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +export CUDA_VISIBLE_DEVICES=0 + +TEST_PY=./optional_input_test.py +TEST_LOG="./test.log" +TEST_RESULT_FILE='test_results.txt' + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -fr *.log + +mkdir -p ./models/identity_2_float32/1 +mkdir -p ./models/ensemble_identity_2_float32/1 +mkdir -p ./models/pipeline_identity_2_float32/1 +mkdir -p ./models/optional_connecting_tensor/1 + +# Basic test cases +TEST_CASES=${TEST_CASES:="test_all_inputs \ + test_optional_same_input \ + test_optional_mix_inputs \ + test_optional_mix_inputs_2 \ + test_ensemble_all_inputs \ + test_ensemble_optional_same_input \ + test_ensemble_optional_mix_inputs \ + test_ensemble_optional_mix_inputs_2 \ + test_ensemble_optional_pipeline \ + test_ensemble_optional_connecting_tensor"} +RET=0 +for i in $TEST_CASES ; do + # Restart server for every test to clear model stats + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$TEST_LOG + + set +e + python $TEST_PY OptionalInputTest.$i >>$TEST_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $SERVER_LOG + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_output_name/output_name_test.py b/qa/L0_output_name/output_name_test.py new file mode 100755 index 0000000000..905174640c --- /dev/null +++ b/qa/L0_output_name/output_name_test.py @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import unittest + +import test_util as tu +from tritongrpcclient import grpc_service_pb2, grpc_service_pb2_grpc + +import grpc + +_trials = ("graphdef", "libtorch", "onnx", "plan", "savedmodel") + + +class OutputNameValidationTest(tu.TestResultCollector): + def requestGenerator(self, model_name, output_name): + request = grpc_service_pb2.ModelInferRequest() + request.model_name = model_name + request.id = "output name validation" + + input = grpc_service_pb2.ModelInferRequest().InferInputTensor() + input.name = "INPUT0" + input.datatype = "FP32" + input.shape.extend([1]) + + request.inputs.extend([input]) + + output = grpc_service_pb2.ModelInferRequest().InferRequestedOutputTensor() + output.name = output_name + request.outputs.extend([output]) + + request.raw_input_contents.extend([bytes(4 * "a", "utf-8")]) + + return request + + def test_grpc(self): + channel = grpc.insecure_channel("localhost:8001") + grpc_stub = grpc_service_pb2_grpc.GRPCInferenceServiceStub(channel) + + # Send request with invalid output name + for trial in _trials: + model_name = "{}_nobatch_zero_1_float32".format(trial) + request = self.requestGenerator(model_name, "DUMMY") + try: + response = grpc_stub.ModelInfer(request) + self.assertTrue( + False, "unexpected success for unknown output " + model_name + ) + except grpc.RpcError as rpc_error: + msg = rpc_error.details() + self.assertTrue( + msg.startswith("unexpected inference output 'DUMMY' for model") + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_output_name/test.sh b/qa/L0_output_name/test.sh new file mode 100755 index 0000000000..7c1a5664a0 --- /dev/null +++ b/qa/L0_output_name/test.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +OP_NAME_TEST_PY=output_name_test.py +CLIENT_LOG="./client.log" +EXPECTED_NUM_TESTS="1" +DATADIR=`pwd`/models + +rm -rf $DATADIR +mkdir $DATADIR + +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/*_nobatch_zero_1_float32 $DATADIR + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $SERVER_LOG $CLIENT_LOG + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +# test gRPC for output name validation +set +e +python $OP_NAME_TEST_PY OutputNameValidationTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test PASSED\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_output_validation/lt_op_val_client.py b/qa/L0_output_validation/lt_op_val_client.py new file mode 100755 index 0000000000..77b5a16e3f --- /dev/null +++ b/qa/L0_output_validation/lt_op_val_client.py @@ -0,0 +1,73 @@ +#!/usr/bin/python + +# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import unittest + +import requests +import test_util as tu + + +class OutputValidationTest(tu.TestResultCollector): + # for datatype mismatch + def test_datatype(self): + url = "http://localhost:8000/v2/models/libtorch_datatype_1_float32/infer" + body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__0"}]}' + response = requests.post(url, data=body) + msg = response.json()["error"] + self.assertTrue( + msg.startswith( + "configuration expects datatype TYPE_INT32 for output 'OUTPUT__0', model provides TYPE_FP32" + ) + ) + + # for output mismatch + def test_index(self): + url = "http://localhost:8000/v2/models/libtorch_index_1_float32/infer" + body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__1"}]}' + response = requests.post(url, data=body) + msg = response.json()["error"] + self.assertTrue( + msg.startswith( + "The output OUTPUT__1 in the model configuration refers to an output index which doesn't exist. This model has 1 outputs" + ) + ) + + # successful run + def test_success(self): + url = "http://localhost:8000/v2/models/libtorch_zero_1_float32/infer" + body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__0"}]}' + response = requests.post(url, data=body) + self.assertEqual(response.status_code, 200) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_output_validation/test.sh b/qa/L0_output_validation/test.sh new file mode 100755 index 0000000000..39874ff4fd --- /dev/null +++ b/qa/L0_output_validation/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +LIBTORCH_OP_VAL_CLIENT=lt_op_val_client.py + +DATADIR=/data/inferenceserver/${REPO_VERSION}/libtorch_model_store2 +EXPECTED_NUM_TESTS="3" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR --exit-on-error=false" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +run_server_tolive +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# give plenty of time for model to load (and fail to load) +wait_for_model_stable $SERVER_TIMEOUT + +RET=0 +CLIENT_LOG=client.log +rm -f ./client.log + +set +e +python $LIBTORCH_OP_VAL_CLIENT >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_parallel_copy/parallel_copy_test.py b/qa/L0_parallel_copy/parallel_copy_test.py new file mode 100755 index 0000000000..6748fee006 --- /dev/null +++ b/qa/L0_parallel_copy/parallel_copy_test.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import functools +import time +import unittest +from builtins import range + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class ParallelCopyTest(tu.TestResultCollector): + def setUp(self): + self.client_ = grpcclient.InferenceServerClient("localhost:8001") + self.dtype_ = np.float32 + self.model_name_ = tu.get_zero_model_name("plan", 1, self.dtype_) + + def _batch_input_duration(self, batch_size): + stats = self.client_.get_inference_statistics(self.model_name_, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + self.assertEqual( + stats.model_stats[0].name, + self.model_name_, + "expect model stats for model {}".format(self.model_name_), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(self.model_name_), + ) + + batch_stats = stats.model_stats[0].batch_stats + + batch_input_duration = 0 + for batch_stat in batch_stats: + if batch_stat.batch_size == batch_size: + batch_input_duration = batch_stat.compute_input.ns + return batch_input_duration + + def _run(self, batch_sizes): + batch_size = functools.reduce(lambda a, b: a + b, batch_sizes, 0) + input_data = [ + np.random.random([bs, 16 * 1024 * 1024]).astype(self.dtype_) + for bs in batch_sizes + ] + inputs = [ + [grpcclient.InferInput("INPUT0", [bs, 16 * 1024 * 1024], "FP32")] + for bs in batch_sizes + ] + output = [grpcclient.InferRequestedOutput("OUTPUT0")] + + for idx in range(len(inputs)): + inputs[idx][0].set_data_from_numpy(input_data[idx]) + + def callback(user_data, idx, result, error): + if error: + user_data[idx] = error + else: + user_data[idx] = result + + # list to hold the results of inference. 
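+ # Each slot is filled by the callback above with either a result or an InferenceServerException.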
+ user_data = [None] * len(batch_sizes) + + before_compute_input_duration = self._batch_input_duration(batch_size) + for idx in range(len(batch_sizes)): + self.client_.async_infer( + model_name=self.model_name_, + inputs=inputs[idx], + callback=functools.partial(callback, user_data, idx), + outputs=output, + ) + + # Wait until the results are available in user_data + time_out = 20 + while time_out > 0: + done = True + for res in user_data: + if res is None: + done = False + break + if done: + break + time_out = time_out - 1 + time.sleep(1) + done_cnt = functools.reduce( + lambda dc, x: dc + 1 if x is not None else dc, user_data, 0 + ) + self.assertEqual( + done_cnt, + len(batch_sizes), + "expected {} responses, got {}".format(len(batch_sizes), done_cnt), + ) + for idx in range(len(batch_sizes)): + res = user_data[idx] + self.assertFalse( + type(res) == InferenceServerException, + "expected response for request {}, got exception {}".format(idx, res), + ) + output_data = res.as_numpy("OUTPUT0") + self.assertTrue( + np.array_equal(output_data, input_data[idx]), + "Mismatched output data for request {}".format(idx), + ) + + after_compute_input_duration = self._batch_input_duration(batch_size) + return after_compute_input_duration - before_compute_input_duration + + def test_performance(self): + model_status = self.client_.is_model_ready(self.model_name_, "1") + self.assertTrue(model_status, "expected model to be ready") + + # Send 1 request with batch size 8 so that the copy is not parallelized + serialized_time = self._run([8]) + parallelized_time = self._run([2, 2, 2, 2]) + + # The following check is loose, local runs show that the speedup is not + # significant (~15%), may be due to the dispatch overhead + # which cancels part of the improvement + self.assertTrue( + serialized_time > parallelized_time, + "Expected parallelized copy is faster than serialized copy", + ) + print( + "serialized v.s. parallelized : {} v.s. {}".format( + serialized_time, parallelized_time + ) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_parallel_copy/test.sh b/qa/L0_parallel_copy/test.sh new file mode 100755 index 0000000000..24a673731a --- /dev/null +++ b/qa/L0_parallel_copy/test.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +PARALLEL_COPY_TEST=parallel_copy_test.py + +DATADIR="./models" + +rm -rf ${DATADIR} +mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_big_model_repository/plan_zero_1_float32 ${DATADIR}/ +# set queue delay to ensure the execution will be in full batch +(cd ${DATADIR}/plan_zero_1_float32 && \ + echo "dynamic_batching { " >> config.pbtxt && \ + echo " preferred_batch_size: [ 8 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 10000000" >> config.pbtxt && \ + echo "}" >> config.pbtxt) + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR --buffer-manager-thread-count=4" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f *.log* + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $PARALLEL_COPY_TEST >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_parameters/model_repository/ensemble/config.pbtxt b/qa/L0_parameters/model_repository/ensemble/config.pbtxt new file mode 100644 index 0000000000..383d89c9f6 --- /dev/null +++ b/qa/L0_parameters/model_repository/ensemble/config.pbtxt @@ -0,0 +1,68 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +platform: "ensemble" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "key" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "value" + data_type: TYPE_STRING + dims: [ -1 ] + } +] + +ensemble_scheduling +{ + step [ + { + model_name: "identity" + model_version: -1 + input_map { key: "INPUT0", value: "INPUT0" } + output_map { key: "OUTPUT0", value: "OUTPUT0" } + }, + { + model_name: "parameter" + model_version: -1 + input_map { key: "INPUT0", value: "OUTPUT0" } + output_map { key: "key", value: "key" } + output_map { key: "value", value: "value" } + } + ] +} diff --git a/qa/L0_parameters/model_repository/identity/config.pbtxt b/qa/L0_parameters/model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..8908845574 --- /dev/null +++ b/qa/L0_parameters/model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
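+# Simple pass-through model; the ensemble uses it as a dummy first step so that request parameters are forwarded through every ensemble step (see the comment in test.sh).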
+ +backend: "identity" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] diff --git a/qa/L0_parameters/model_repository/parameter/1/model.py b/qa/L0_parameters/model_repository/parameter/1/model.py new file mode 100644 index 0000000000..c175860962 --- /dev/null +++ b/qa/L0_parameters/model_repository/parameter/1/model.py @@ -0,0 +1,77 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + inputs = [{"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [1]}] + outputs = [ + {"name": "key", "data_type": "TYPE_STRING", "dims": [-1]}, + {"name": "value", "data_type": "TYPE_STRING", "dims": [-1]}, + ] + + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) + + for input in inputs: + if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(0) + return auto_complete_model_config + + def execute(self, requests): + # A simple model that puts the request parameters into the outputs. 
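+ # Any HTTP/GRPC headers matching the server's header-forward-pattern are also surfaced here as request parameters.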
+ responses = [] + for request in requests: + parameters = json.loads(request.parameters()) + keys = [] + values = [] + for key, value in parameters.items(): + keys.append(key) + values.append(value) + key_output = pb_utils.Tensor("key", np.asarray(keys, dtype=object)) + value_output = pb_utils.Tensor("value", np.asarray(values, dtype=object)) + inference_response = pb_utils.InferenceResponse( + output_tensors=[key_output, value_output] + ) + responses.append(inference_response) + + return responses diff --git a/qa/L0_parameters/parameters_test.py b/qa/L0_parameters/parameters_test.py new file mode 100755 index 0000000000..a20d13c1eb --- /dev/null +++ b/qa/L0_parameters/parameters_test.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
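+# Verifies that request parameters (and headers matching the forward pattern) are delivered to the model over HTTP and GRPC, for sync, async, and streaming clients, as well as through an ensemble.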
+ +import sys + +sys.path.append("../common") + +import os +import queue +import unittest +from functools import partial +from unittest import IsolatedAsyncioTestCase + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.grpc.aio as asyncgrpcclient +import tritonclient.http as httpclient +import tritonclient.http.aio as asynchttpclient +from tritonclient.utils import InferenceServerException + +TEST_HEADER = os.environ.get("TEST_HEADER") + + +class InferenceParametersTest(IsolatedAsyncioTestCase): + async def asyncSetUp(self): + self.http = httpclient.InferenceServerClient(url="localhost:8000") + self.async_http = asynchttpclient.InferenceServerClient(url="localhost:8000") + self.grpc = grpcclient.InferenceServerClient(url="localhost:8001") + self.async_grpc = asyncgrpcclient.InferenceServerClient(url="localhost:8001") + + self.parameter_list = [] + self.parameter_list.append({"key1": "value1", "key2": "value2"}) + self.parameter_list.append({"key1": 1, "key2": 2}) + self.parameter_list.append({"key1": 123.123, "key2": 321.321}) + self.parameter_list.append({"key1": True, "key2": "value2"}) + self.parameter_list.append({"triton_": True, "key2": "value2"}) + + # Only "test_params" tests parameters without headers. + if TEST_HEADER != "test_params": + self.headers = { + "header_1": "value_1", + "header_2": "value_2", + "my_header_1": "my_value_1", + "my_header_2": "my_value_2", + "my_header_3": 'This is a "quoted" string with a backslash\ ', + } + + # only these headers should be forwarded to the model. + if TEST_HEADER == "test_grpc_header_forward_pattern_case_sensitive": + self.expected_headers = {} + else: + self.expected_headers = { + "my_header_1": "my_value_1", + "my_header_2": "my_value_2", + "my_header_3": 'This is a "quoted" string with a backslash\ ', + } + else: + self.headers = {} + self.expected_headers = {} + + def callback(user_data, result, error): + if error: + user_data.put(error) + else: + user_data.put(result) + + self.grpc_callback = callback + + def create_inputs(self, client_type): + inputs = [] + inputs.append(client_type.InferInput("INPUT0", [1], "FP32")) + + # Initialize the data + inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32)) + return inputs + + async def send_request_and_verify( + self, client_type, client, is_async=False, model_name="parameter" + ): + inputs = self.create_inputs(client_type) + for parameters in self.parameter_list: + # Setup infer callable to re-use below for brevity + infer_callable = partial( + client.infer, + model_name=model_name, + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) + + # The `triton_` prefix is reserved for Triton usage + should_error = False + if "triton_" in parameters.keys(): + should_error = True + + if is_async: + if should_error: + with self.assertRaises(InferenceServerException): + await infer_callable() + return + else: + result = await infer_callable() + else: + if should_error: + with self.assertRaises(InferenceServerException): + infer_callable() + return + else: + result = infer_callable() + + self.verify_outputs(result, parameters) + + def verify_outputs(self, result, parameters): + keys = result.as_numpy("key") + values = result.as_numpy("value") + keys = keys.astype(str).tolist() + expected_keys = list(parameters.keys()) + list(self.expected_headers.keys()) + self.assertEqual(set(keys), set(expected_keys)) + + # We have to convert the parameter values to string + expected_values = [] + for expected_value in list(parameters.values()): + 
expected_values.append(str(expected_value)) + for value in self.expected_headers.values(): + expected_values.append(value) + self.assertEqual(set(values.astype(str).tolist()), set(expected_values)) + + async def test_grpc_parameter(self): + await self.send_request_and_verify(grpcclient, self.grpc) + + async def test_http_parameter(self): + await self.send_request_and_verify(httpclient, self.http) + + async def test_async_http_parameter(self): + await self.send_request_and_verify( + asynchttpclient, self.async_http, is_async=True + ) + + async def test_async_grpc_parameter(self): + await self.send_request_and_verify( + asyncgrpcclient, self.async_grpc, is_async=True + ) + + def test_http_async_parameter(self): + inputs = self.create_inputs(httpclient) + # Skip the parameter that returns an error + parameter_list = self.parameter_list[:-1] + for parameters in parameter_list: + result = self.http.async_infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ).get_result() + self.verify_outputs(result, parameters) + + def test_grpc_async_parameter(self): + user_data = queue.Queue() + inputs = self.create_inputs(grpcclient) + # Skip the parameter that returns an error + parameter_list = self.parameter_list[:-1] + for parameters in parameter_list: + self.grpc.async_infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + callback=partial(self.grpc_callback, user_data), + ) + result = user_data.get() + self.assertFalse(result is InferenceServerException) + self.verify_outputs(result, parameters) + + def test_grpc_stream_parameter(self): + user_data = queue.Queue() + self.grpc.start_stream( + callback=partial(self.grpc_callback, user_data), headers=self.headers + ) + inputs = self.create_inputs(grpcclient) + # Skip the parameter that returns an error + parameter_list = self.parameter_list[:-1] + for parameters in parameter_list: + # async stream infer + self.grpc.async_stream_infer( + model_name="parameter", inputs=inputs, parameters=parameters + ) + result = user_data.get() + self.assertFalse(result is InferenceServerException) + self.verify_outputs(result, parameters) + self.grpc.stop_stream() + + async def test_ensemble_parameter_forwarding(self): + await self.send_request_and_verify(httpclient, self.http, model_name="ensemble") + + async def asyncTearDown(self): + self.http.close() + self.grpc.close() + await self.async_grpc.close() + await self.async_http.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_parameters/test.sh b/qa/L0_parameters/test.sh new file mode 100755 index 0000000000..c53b02d4b7 --- /dev/null +++ b/qa/L0_parameters/test.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +CLIENT_LOG="./client.log" +TEST_SCRIPT_PY="parameters_test.py" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +MODELDIR="model_repository" +# Use identity model as dummy step to ensure parameters pass through each step +mkdir -p "${MODELDIR}/identity/1" +mkdir -p "${MODELDIR}/ensemble/1" + +# TODO: Add support and testing for C++ client parameters: +# https://jirasw.nvidia.com/browse/DLIS-4673 + +all_tests=("test_params" + "test_headers" + "test_header_forward_pattern_case_insensitive" + "test_grpc_header_forward_pattern_case_sensitive") + +RET=0 +for i in "${all_tests[@]}"; do + # TEST_HEADER is a parameter used by `parameters_test.py` that controls + # whether the script will test for inclusion of headers in parameters or not. + SERVER_ARGS="--model-repository=${MODELDIR} --exit-timeout-secs=120" + if [ "$i" == "test_headers" ]; then + SERVER_ARGS+=" --grpc-header-forward-pattern my_header.*" + SERVER_ARGS+=" --http-header-forward-pattern my_header.*" + elif [ "$i" == "test_header_forward_pattern_case_insensitive" ]; then + SERVER_ARGS+=" --grpc-header-forward-pattern MY_HEADER.*" + SERVER_ARGS+=" --http-header-forward-pattern MY_HEADER.*" + # NOTE: headers sent through the python HTTP client may be automatically + # lowercased by internal libraries like geventhttpclient, so we only test + # GRPC client for case-sensitivity here: + # https://github.com/geventhttpclient/geventhttpclient/blob/d1e14356c3b02099c879cf9b3bdb684a0cbd8bf5/src/geventhttpclient/header.py#L62-L63 + elif [ "$i" == "test_grpc_header_forward_pattern_case_sensitive" ]; then + SERVER_ARGS+=" --grpc-header-forward-pattern (?-i)MY_HEADER.*" + fi + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + TEST_HEADER="$i" python3 $TEST_SCRIPT_PY >$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET + diff --git a/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt b/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt new file mode 100644 index 0000000000..72d041feac --- /dev/null +++ b/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt @@ -0,0 +1,62 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "distributed_int32_int32_int32" +backend: "distributed_addsub" +max_batch_size: 1 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + }, + { + kind: KIND_GPU + passive: true + } +] \ No newline at end of file diff --git a/qa/L0_passive_instance/passive_instance_test.py b/qa/L0_passive_instance/passive_instance_test.py new file mode 100755 index 0000000000..d7cdfffa7b --- /dev/null +++ b/qa/L0_passive_instance/passive_instance_test.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + + +class PassiveInstanceTest(tu.TestResultCollector): + def test_inference(self): + try: + iu.infer_exact( + self, "distributed", (1, 16), 1, np.int32, np.int32, np.int32 + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_passive_instance/test.sh b/qa/L0_passive_instance/test.sh new file mode 100755 index 0000000000..8948434485 --- /dev/null +++ b/qa/L0_passive_instance/test.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +TEST_SCRIPT_PY=passive_instance_test.py +EXPECTED_NUM_TESTS="1" + +PERF_ANALYZER=../clients/perf_analyzer +MODEL=distributed_int32_int32_int32 + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=models --exit-timeout-secs=120" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $SERVER_LOG $CLIENT_LOG + +mkdir -p models/${MODEL}/1 + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_SCRIPT_PY >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +# Generate concurrency, check if only the CPU instances are accepting requests +$PERF_ANALYZER -m $MODEL --concurrency-range 4 >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** perf_analyzer for $MODEL failed\n***" + RET=1 +fi + +grep "(GPU device 0), executing" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expecting no request sent to GPU instance\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_analyzer/nginx.conf b/qa/L0_perf_analyzer/nginx.conf new file mode 100644 index 0000000000..4a7dfcc04a --- /dev/null +++ b/qa/L0_perf_analyzer/nginx.conf @@ -0,0 +1,38 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
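+
+# This config is used by the HTTPS tests in test.sh: nginx terminates TLS on
+# port 443 and proxies the requests to the Triton HTTP endpoint at localhost:8000.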
+ +server { + listen 443 ssl; + server_name localhost; + + ssl_certificate /etc/nginx/cert.crt; + ssl_certificate_key /etc/nginx/cert.key; + + location / { + proxy_pass http://localhost:8000; + proxy_http_version 1.1; + } +} diff --git a/qa/L0_perf_analyzer/perf_analyzer_profile_export_schema.json b/qa/L0_perf_analyzer/perf_analyzer_profile_export_schema.json new file mode 100644 index 0000000000..d0feacd9b4 --- /dev/null +++ b/qa/L0_perf_analyzer/perf_analyzer_profile_export_schema.json @@ -0,0 +1,95 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/examples/schema.json", + "title": "Perf Analyzer output data", + "description": "A json file describing the output from a Perf Analyzer run.", + "type": "object", + "required": [ + "experiments", + "version" + ], + "properties": { + "experiments": { + "description": "The array of all experiments run by Perf Analyzer.", + "type": "array", + "required": [ + "experiment", + "requests", + "window_boundaries" + ], + "minItems": 1, + "uniqueItems": true, + "items": { + "type": "object", + "properties": { + "experiment": { + "description": "A single experiment run by Perf Analyzer.", + "type": "object", + "required": [ + "mode", + "value" + ], + "minItems": 1, + "maxItems": 1, + "properties": { + "mode": { + "description": "Operating mode of Perf Analyzer: For example, 'concurrency' or 'request rate'.", + "type": "string" + }, + "value": { + "description": "Concurrency or request rate for the current experiment.", + "type": "integer" + } + } + }, + "requests": { + "description": "The array of requests sent by Perf Analyzer for this experiment.", + "type": "array", + "items": { + "$ref": "#/properties/experiments/items/properties/$defs/request" + } + }, + "$defs": { + "request": { + "description": "Info for a single request.", + "type": "object", + "required": [ + "timestamp", + "response_timestamps" + ], + "properties": { + "timestamp": { + "description": "Time stamp of the request.", + "type": "integer" + }, + "sequence_id": { + "description": "The sequence_id of the request.", + "type": "integer" + }, + "response_timestamps": { + "description": "All associated responses to this request.", + "type": "array", + "items": { + "type": "integer" + } + } + } + } + }, + "window_boundaries": { + "description": "An array of time stamps describing window boundaries.", + "type": "array", + "items": { + "type": "integer" + }, + "uniqueItems": true + } + } + } + }, + "version": { + "description": "The version of Perf Analyzer that generated the report.", + "type": "string" + } + } +} \ No newline at end of file diff --git a/qa/L0_perf_analyzer/test.sh b/qa/L0_perf_analyzer/test.sh new file mode 100755 index 0000000000..49c7e72e48 --- /dev/null +++ b/qa/L0_perf_analyzer/test.sh @@ -0,0 +1,1164 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./perf_analyzer.log" +PERF_ANALYZER=../clients/perf_analyzer + +DATADIR=`pwd`/models +TESTDATADIR=`pwd`/test_data + +INT_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/int_data.json +INT_DIFFSHAPE_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/int_data_diff_shape.json +INT_OPTIONAL_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/int_data_optional.json +FLOAT_DIFFSHAPE_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/float_data_with_shape.json +STRING_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/string_data.json +STRING_WITHSHAPE_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/string_data_with_shape.json +SEQ_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/seq_data.json +SHAPETENSORADTAFILE=`pwd`/../common/perf_analyzer_input_data_json/shape_tensor_data.json +IMAGE_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/image_data.json + +OUTPUT_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/output.json +NON_ALIGNED_OUTPUT_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/non_aligned_output.json +WRONG_OUTPUT_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/wrong_output.json +WRONG_OUTPUT_2_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/wrong_output_2.json + +SEQ_OUTPUT_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/seq_output.json +SEQ_WRONG_OUTPUT_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/seq_wrong_output.json + +REPEAT_INT32_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/repeat_int32_data.json + +TRACE_FILE="trace.json" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=${DATADIR} --trace-config triton,file=${TRACE_FILE}" +SERVER_LOG="./inference_server.log" + +ERROR_STRING="error | Request count: 0 | : 0 infer/sec" + +STABILITY_THRESHOLD="100" + +source ../common/util.sh + +rm -f $SERVER_LOG $CLIENT_LOG +rm -rf $DATADIR $TESTDATADIR $ENSEMBLE_DATADIR + +mkdir -p $DATADIR +# Copy fixed-shape models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_int32_int32_int32 $DATADIR/ +cp -r 
/data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_nobatch_int32_int32_int32 $DATADIR/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_object_object_object $DATADIR/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_nobatch_object_object_object $DATADIR/ + +# Copy a variable-shape models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/graphdef_object_int32_int32 $DATADIR/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/graphdef_int32_int32_float32 $DATADIR/ + +# Copy shape tensor models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_shapetensor_model_repository/plan_zero_1_float32_int32 $DATADIR/ + +# Copying ensemble including a sequential model +cp -r /data/inferenceserver/${REPO_VERSION}/qa_sequence_model_repository/savedmodel_sequence_object $DATADIR +cp -r /data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_sequence_model_repository/simple_savedmodel_sequence_object $DATADIR +cp -r /data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_sequence_model_repository/nop_TYPE_FP32_-1 $DATADIR + +# Copying variable sequence model +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_sequence_model_repository/graphdef_sequence_float32 $DATADIR + +mkdir $DATADIR/nop_TYPE_FP32_-1/1 + +# Copy inception model to the model repository +cp -r /data/inferenceserver/${REPO_VERSION}/tf_model_store/inception_v1_graphdef $DATADIR + +# Copy resnet50v1.5_fp16 +cp -r /data/inferenceserver/${REPO_VERSION}/perf_model_store/resnet50v1.5_fp16_savedmodel $DATADIR + +# Copy and customize custom_zero_1_float32 +cp -r ../custom_models/custom_zero_1_float32 $DATADIR && \ + mkdir $DATADIR/custom_zero_1_float32/1 && \ + (cd $DATADIR/custom_zero_1_float32 && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"100\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +# Copy and customize optional inputs model +cp -r ../python_models/optional $DATADIR && \ + mkdir $DATADIR/optional/1 && \ + mv $DATADIR/optional/model.py $DATADIR/optional/1 && \ + sed -i 's/max_batch_size: 0/max_batch_size: 2/g' $DATADIR/optional/config.pbtxt + +# Copy decoupled model +git clone --depth=1 https://github.com/triton-inference-server/python_backend +mkdir -p $DATADIR/repeat_int32/1 +cp python_backend/examples/decoupled/repeat_config.pbtxt $DATADIR/repeat_int32/config.pbtxt +cp python_backend/examples/decoupled/repeat_model.py $DATADIR/repeat_int32/1/model.py + +# Generating test data +mkdir -p $TESTDATADIR +for INPUT in INPUT0 INPUT1; do + for i in {1..16}; do + echo '1' >> $TESTDATADIR/${INPUT} + done +done + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + + +# Test whether there was a conflict in sending sequences. This should +# be done before other testing as the server might emit this warning +# in certain test cases that are expected to raise this warning +SERVER_ERROR_STRING="The previous sequence did not end before this sequence start" + +set +e +$PERF_ANALYZER -v -i $PROTOCOL -m graphdef_object_object_object -p2000 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? 
-eq 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed: Expected an error when using dynamic shapes in string inputs\n***"
+ RET=1
+fi
+if [ $(cat $CLIENT_LOG | grep "input INPUT0 contains dynamic shape, provide shapes to send along with the request" | wc -l) -eq 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed: Expected dynamic shape error message was not found\n***"
+ RET=1
+fi
+
+# Testing with ensemble and sequential model variants
+$PERF_ANALYZER -v -i grpc -m simple_savedmodel_sequence_object -p 2000 -t5 --streaming \
+--input-data=$SEQ_JSONDATAFILE --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+fi
+if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+fi
+if [ $(cat $SERVER_LOG | grep "${SERVER_ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $SERVER_LOG | grep "${SERVER_ERROR_STRING}"
+ echo -e "\n***\n*** Test Failed: Sequence conflict when maintaining concurrency\n***"
+ RET=1
+fi
+
+$PERF_ANALYZER -v -i grpc -m simple_savedmodel_sequence_object -p 1000 --request-rate-range 100:200:50 --streaming \
+--input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+fi
+if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+fi
+
+if [ $(cat $SERVER_LOG | grep "${SERVER_ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $SERVER_LOG | grep "${SERVER_ERROR_STRING}"
+ echo -e "\n***\n*** Test Failed: Sequence conflict\n***"
+ RET=1
+fi
+set -e
+
+for PROTOCOL in grpc http; do
+
+ # Testing simple configurations with different shared memory types
+ for SHARED_MEMORY_TYPE in none system cuda; do
+ set +e
+ $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 \
+ --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+ if [ $? -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+ if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+
+ $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 -a \
+ --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+ if [ $? -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+ if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+ set -e
+ done
+
+ # TODO Add back testing with preprocess_inception_ensemble model
+
+ # Testing with inception model
+ for SHARED_MEMORY_TYPE in none system cuda; do
+ set +e
+ $PERF_ANALYZER -v -i $PROTOCOL -m inception_v1_graphdef -t 1 -p2000 -b 1 \
+ --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+ if [ $? -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+ if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then
+ cat $CLIENT_LOG
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+
+ $PERF_ANALYZER -v -i $PROTOCOL -m inception_v1_graphdef -t 1 -p2000 -b 1 -a \
+ --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+ if [ $?
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + # Testing with resnet50 models with large batch sizes + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m inception_v1_graphdef -t 2 -p2000 -b 64 \ + --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m inception_v1_graphdef -t 2 -p2000 -b 64 \ + --shared-memory=$SHARED_MEMORY_TYPE -a -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + # Test perf client behavior on different model with different batch size + for MODEL in graphdef_nobatch_int32_int32_int32 graphdef_int32_int32_int32; do + # Valid batch size + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m $MODEL -t 1 -p2000 -b 1 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # Invalid batch sizes + for STATIC_BATCH in 0 10; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m $MODEL -t 1 -p2000 -b $STATIC_BATCH -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + done + + # Testing with the new arguments + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 --concurrency-range 1:5:2 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "error | Request count: 0 | : 0 infer/sec\|: 0 usec|Request concurrency: 2" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 --concurrency-range 1:5:2 \ + --input-data=${INT_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "error | Request count: 0 | : 0 infer/sec\|: 0 usec|Request concurrency: 2" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 --request-rate-range 1000:2000:500 \ + -p1000 -b 1 -a -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 --request-rate-range 1000:2000:500 \ + --input-data=${INT_JSONDATAFILE} -p1000 -b 1 -a -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + # Binary search for request rate mode + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_int32 --request-rate-range 1000:2000:100 -p1000 -b 1 \ + -a --binary-search --request-distribution "poisson" -l 10 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # Binary search for concurrency range mode and make sure it doesn't hang + $PERF_ANALYZER -v -a --request-distribution "poisson" --shared-memory none \ + --percentile 99 --binary-search --concurrency-range 1:8:2 -l 5 \ + -m graphdef_int32_int32_int32 -b 1 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 & + PA_PID=$! + if [ "$PA_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $PERF_ANALYZER\n***" + cat $CLIENT_LOG + RET=1 + fi + # wait for PA to finish running + sleep 200 + if ps -p $PA_PID > /dev/null; then + cat $CLIENT_LOG + echo -e "\n***\n*** $PERF_ANALYZER is hanging after 200 s\n***" + kill $PA_PID + RET=1 + fi + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # Testing with combinations of string input and shared memory types + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_object_object_object --string-data=1 -p2000 \ + --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + # Testing with combinations of file inputs and shared memory types + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_object_object_object --input-data=$TESTDATADIR -p2000 \ + --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_object_object_object --input-data=$STRING_JSONDATAFILE \ + --input-data=$STRING_JSONDATAFILE -p2000 --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + # Testing with combinations of variable inputs and shared memory types + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_object_int32_int32 --input-data=$TESTDATADIR \ + --shape INPUT0:2,8 --shape INPUT1:2,8 -p2000 --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} \ + >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_object_int32_int32 --input-data=$STRING_WITHSHAPE_JSONDATAFILE \ + --shape INPUT0:2,8 --shape INPUT1:2,8 -p2000 --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} \ + >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_float32 --shape INPUT0:2,8,2 \ + --shape INPUT1:2,8,2 -p2000 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # Trying to batch tensors with different shape + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m graphdef_int32_int32_float32 --shape INPUT0:2,8,2 --shape INPUT1:2,8,2 -p2000 -b 4 \ + --shared-memory=$SHARED_MEMORY_TYPE --input-data=$INT_DIFFSHAPE_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep -P "The supplied shape .+ is incompatible with the model's input shape" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + # Shape tensor I/O model (server needs the shape tensor on the CPU) + for SHARED_MEMORY_TYPE in none system; do + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m plan_zero_1_float32_int32 --input-data=$SHAPETENSORADTAFILE \ + --shape DUMMY_INPUT0:4,4 -p2000 --shared-memory=$SHARED_MEMORY_TYPE -b 8 -s ${STABILITY_THRESHOLD} \ + >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep ": 0 infer/sec\|: 0 usec" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 2000 -t5 --sync \ + --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 2000 -t5 --sync \ + --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 1000 --request-rate-range 100:200:50 --sync \ + --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + + # Testing with variable ensemble model. This unit specifies different shape values + # for different inferences. + for SHARED_MEMORY_TYPE in none system cuda; do + set +e + # FIXME: Enable HTTP when the server is able to correctly return the complex error messages. + $PERF_ANALYZER -v -i grpc -m graphdef_sequence_float32 --shape INPUT:2 --input-data=$FLOAT_DIFFSHAPE_JSONDATAFILE \ + --input-data=$FLOAT_DIFFSHAPE_JSONDATAFILE -p2000 --shared-memory=$SHARED_MEMORY_TYPE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep -P "The supplied shape .+ is incompatible with the model's input shape" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + done + + # Testing that trace logging works + set +e + rm ${TRACE_FILE}* + $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 2000 -t5 --sync \ + --trace-level TIMESTAMPS --trace-rate 1000 --trace-count 100 --log-frequency 10 \ + --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if ! compgen -G "$TRACE_FILE*" > /dev/null; then + echo -e "\n***\n*** Test Failed. $TRACE_FILE failed to generate.\n***" + RET=1 + elif [ $(cat ${TRACE_FILE}* | grep "REQUEST_START" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed. Did not find `REQUEST_START` in $TRACE_FILE \n***" + RET=1 + fi + curl localhost:8000/v2/trace/setting -d '{"trace_level":["OFF"]}' + set -e + + # Testing that setting trace file does not work + set +e + $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object \ + --trace-file $TRACE_FILE >$CLIENT_LOG 2>&1 + if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed. Expected to fail for unknown arg --trace-file" + RET=1 + fi + curl localhost:8000/v2/trace/setting -d '{"trace_level":["OFF"]}' + set -e +done + +# Test with output validation +set +e +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 --input-data=${NON_ALIGNED_OUTPUT_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? 
-eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "The 'validation_data' field doesn't align with 'data' field in the json file" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 --input-data=${WRONG_OUTPUT_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "mismatch in the data provided" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 --input-data=${WRONG_OUTPUT_2_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "Output doesn't match expected output" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + + +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 --input-data=${OUTPUT_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m simple_savedmodel_sequence_object -i grpc --streaming \ +--input-data=${SEQ_WRONG_OUTPUT_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "Output doesn't match expected output" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m simple_savedmodel_sequence_object -i grpc --streaming \ +--input-data=${SEQ_OUTPUT_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +## Testing with very large concurrencies and large dataset +INPUT_DATA_OPTION="--input-data $SEQ_JSONDATAFILE " +for i in {1..9}; do + INPUT_DATA_OPTION=" ${INPUT_DATA_OPTION} ${INPUT_DATA_OPTION}" +done +set +e +$PERF_ANALYZER -v -m simple_savedmodel_sequence_object -p 10000 --concurrency-range 1500:2000:250 -i grpc --streaming \ +${INPUT_DATA_OPTION} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +## Test count_windows mode +set +e + +# Send incorrect shape and make sure that perf_analyzer doesn't hang +$PERF_ANALYZER -v -m graphdef_object_int32_int32 --measurement-mode "count_windows" \ + --shape INPUT0:1,8,100 --shape INPUT1:2,8 --string-data=1 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? 
-eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "unexpected shape for input 'INPUT0' for model" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m graphdef_object_int32_int32 --measurement-mode "count_windows" \ + --shape INPUT0:2,8 --shape INPUT1:2,8 --string-data=1 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +# Test with optional inputs missing but still valid +set +e +$PERF_ANALYZER -v -m optional --measurement-mode "count_windows" \ + --input-data=${INT_OPTIONAL_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +# Test with optional inputs missing and invalid +set +e +OPTIONAL_INPUT_ERROR_STRING="For batch sizes larger than 1, the same set of +inputs must be specified for each batch. You cannot use different set of +optional inputs for each individual batch." +$PERF_ANALYZER -v -m optional -b 2 --measurement-mode "count_windows" \ + --input-data=${INT_OPTIONAL_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${OPTIONAL_INPUT_ERROR_STRING}" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + + +# Test Custom request rate option +CUSTOM_SCHEDULE_FILE=$TESTDATADIR/custom.schedule +echo '30000' >> $CUSTOM_SCHEDULE_FILE +echo '10000' >> $CUSTOM_SCHEDULE_FILE +echo '40000' >> $CUSTOM_SCHEDULE_FILE +echo '20000' >> $CUSTOM_SCHEDULE_FILE +echo '25000' >> $CUSTOM_SCHEDULE_FILE + +set +e +$PERF_ANALYZER -v -i grpc -m graphdef_int32_int32_int32 --request-intervals $CUSTOM_SCHEDULE_FILE >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "Request Rate: 40" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed: \n***" + RET=1 +fi +set -e + +# Test --serial-sequences mode +set +e +$PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 1000 --request-rate-range 100:200:50 --serial-sequences \ + --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 1000 --request-intervals $CUSTOM_SCHEDULE_FILE --serial-sequences \ + --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +## Test decoupled model support +$PERF_ANALYZER -v -m repeat_int32 --input-data=$REPEAT_INT32_JSONDATAFILE \ + --profile-export-file profile_export.json -i grpc --async --streaming -s \ + ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +python3 -c "import json ; \ + requests = json.load(open('profile_export.json'))['experiments'][0]['requests'] ; \ + assert any(len(r['response_timestamps']) > 1 for r in requests)" +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +check-jsonschema --schemafile perf_analyzer_profile_export_schema.json profile_export.json +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +## Test perf_analyzer with MPI / multiple models + +is_synchronized() { + local TIMESTAMP_RANK_0_STABLE=$(grep -oP "^\K[^$]+(?=\[1,0\]:All models on all MPI ranks are stable)" 1/rank.0/stdout | date "+%s" -f -) + local TIMESTAMP_RANK_1_STABLE=$(grep -oP "^\K[^$]+(?=\[1,1\]:All models on all MPI ranks are stable)" 1/rank.1/stdout | date "+%s" -f -) + local TIMESTAMP_RANK_2_STABLE=$(grep -oP "^\K[^$]+(?=\[1,2\]:All models on all MPI ranks are stable)" 1/rank.2/stdout | date "+%s" -f -) + local TIMESTAMP_MIN=$(echo -e "${TIMESTAMP_RANK_0_STABLE}\n${TIMESTAMP_RANK_1_STABLE}\n${TIMESTAMP_RANK_2_STABLE}" | sort -n | head -1) + local TIMESTAMP_MAX=$(echo -e "${TIMESTAMP_RANK_0_STABLE}\n${TIMESTAMP_RANK_1_STABLE}\n${TIMESTAMP_RANK_2_STABLE}" | sort -n | tail -1) + local TIMESTAMP_MAX_MIN_DIFFERENCE=$((${TIMESTAMP_MAX}-${TIMESTAMP_MIN})) + local ALLOWABLE_SECONDS_BETWEEN_PROFILES_FINISHING="5" + echo $(($TIMESTAMP_MAX_MIN_DIFFERENCE <= $ALLOWABLE_SECONDS_BETWEEN_PROFILES_FINISHING)) +} + +is_stable() { + local RANK=$1 + local IS_THROUGHPUT=$2 + if [ $IS_THROUGHPUT ]; then + local GREP_PATTERN="\[1,$RANK\]: Pass \[[0-9]+\] throughput: \K[0-9]+\.?[0-9]*" + else + local GREP_PATTERN="\[1,$RANK\]: Pass \[[0-9]+\] throughput: [0-9]+\.?[0-9]* infer/sec. Avg latency: \K[0-9]+" + fi + local LAST_MINUS_0=$(grep -oP "$GREP_PATTERN" 1/rank.$RANK/stdout | tail -3 | sed -n 3p) + local LAST_MINUS_1=$(grep -oP "$GREP_PATTERN" 1/rank.$RANK/stdout | tail -3 | sed -n 2p) + local LAST_MINUS_2=$(grep -oP "$GREP_PATTERN" 1/rank.$RANK/stdout | tail -3 | sed -n 1p) + local MEAN=$(awk "BEGIN {print (($LAST_MINUS_0+$LAST_MINUS_1+$LAST_MINUS_2)/3)}") + local STABILITY_THRESHOLD=0.5 + # Based on this: https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/inference_profiler.cc#L629-L644 + local WITHIN_THRESHOLD_0=$(awk "BEGIN {print ($LAST_MINUS_0 >= ((1 - $STABILITY_THRESHOLD) * $MEAN) && $LAST_MINUS_0 <= ((1 + $STABILITY_THRESHOLD) * $MEAN))}") + local WITHIN_THRESHOLD_1=$(awk "BEGIN {print ($LAST_MINUS_1 >= ((1 - $STABILITY_THRESHOLD) * $MEAN) && $LAST_MINUS_1 <= ((1 + $STABILITY_THRESHOLD) * $MEAN))}") + local WITHIN_THRESHOLD_2=$(awk "BEGIN {print ($LAST_MINUS_2 >= ((1 - $STABILITY_THRESHOLD) * $MEAN) && $LAST_MINUS_2 <= ((1 + $STABILITY_THRESHOLD) * $MEAN))}") + echo $(($WITHIN_THRESHOLD_0 && $WITHIN_THRESHOLD_1 && $WITHIN_THRESHOLD_2)) +} + +set +e +mpiexec --allow-run-as-root \ + -n 1 --merge-stderr-to-stdout --output-filename . 
--tag-output --timestamp-output \
+ $PERF_ANALYZER -v -m graphdef_int32_int32_int32 \
+ --measurement-mode count_windows -s 50 --enable-mpi : \
+ -n 1 --merge-stderr-to-stdout --output-filename . --tag-output --timestamp-output \
+ $PERF_ANALYZER -v -m graphdef_nobatch_int32_int32_int32 \
+ --measurement-mode count_windows -s 50 --enable-mpi : \
+ -n 1 --merge-stderr-to-stdout --output-filename . --tag-output --timestamp-output \
+ $PERF_ANALYZER -v -m custom_zero_1_float32 \
+ --measurement-mode count_windows -s 50 --enable-mpi
+if [ $? -ne 0 ]; then
+ cat 1/rank.0/stdout 1/rank.1/stdout 1/rank.2/stdout
+ echo -e "\n***\n*** Perf Analyzer returned non-zero exit code\n***"
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+else
+ if [ $(is_synchronized) -eq 0 ]; then
+ cat 1/rank.0/stdout 1/rank.1/stdout 1/rank.2/stdout
+ echo -e "\n***\n*** Models did not all finish profiling at approximately the same time\n***"
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+
+ RANK_0_THROUGHPUT_IS_STABLE=$(is_stable 0 1)
+ RANK_0_LATENCY_IS_STABLE=$(is_stable 0 0)
+ RANK_1_THROUGHPUT_IS_STABLE=$(is_stable 1 1)
+ RANK_1_LATENCY_IS_STABLE=$(is_stable 1 0)
+ RANK_2_THROUGHPUT_IS_STABLE=$(is_stable 2 1)
+ RANK_2_LATENCY_IS_STABLE=$(is_stable 2 0)
+
+ ALL_STABLE=$(( \
+ $RANK_0_THROUGHPUT_IS_STABLE && \
+ $RANK_0_LATENCY_IS_STABLE && \
+ $RANK_1_THROUGHPUT_IS_STABLE && \
+ $RANK_1_LATENCY_IS_STABLE && \
+ $RANK_2_THROUGHPUT_IS_STABLE && \
+ $RANK_2_LATENCY_IS_STABLE))
+
+ if [ $ALL_STABLE -eq 0 ]; then
+ cat 1/rank.0/stdout 1/rank.1/stdout 1/rank.2/stdout
+ echo -e "\n***\n*** Not all models stabilized\n***"
+ echo -e "\n***\n*** Test Failed\n***"
+ RET=1
+ fi
+
+ rm -rf 1
+fi
+set -e
+
+## Test perf_analyzer without MPI library (`libmpi.so`) available
+
+rm -rf /opt/hpcx/ompi/lib/libmpi*
+
+set +e
+$PERF_ANALYZER -v -m graphdef_int32_int32_int32 -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1
+if [ $?
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Generate valid CA +openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" + +# Generate valid Server Key/Cert +openssl genrsa -passout pass:1234 -des3 -out server.key 4096 +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt + +# Remove passphrase from the Server Key +openssl rsa -passin pass:1234 -in server.key -out server.key + +# Generate valid Client Key/Cert +openssl genrsa -passout pass:1234 -des3 -out client.key 4096 +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt + +# Remove passphrase from Client Key +openssl rsa -passin pass:1234 -in client.key -out client.key + +# Create mutated client key (Make first char of each like capital) +cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key +cp client.crt client2.crt && sed -i "s/\b\(.\)/\u\1/g" client2.crt + +SERVER_ARGS="--model-repository=${DATADIR} --grpc-use-ssl=1 --grpc-server-cert=server.crt --grpc-server-key=server.key --grpc-root-cert=ca.crt" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Test gRPC SSL +set +e + +# Test that gRPC protocol with SSL works correctly +$PERF_ANALYZER -v -i grpc -m graphdef_int32_int32_int32 \ + --ssl-grpc-use-ssl \ + --ssl-grpc-root-certifications-file=ca.crt \ + --ssl-grpc-private-key-file=client.key \ + --ssl-grpc-certificate-chain-file=client.crt \ + -s ${STABILITY_THRESHOLD} \ + > ${CLIENT_LOG}.grpc_success 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.grpc_success + RET=1 +fi + +# Test that gRPC protocol with SSL fails with incorrect key +$PERF_ANALYZER -v -i grpc -m graphdef_int32_int32_int32 \ + --ssl-grpc-use-ssl \ + --ssl-grpc-root-certifications-file=ca.crt \ + --ssl-grpc-private-key-file=client.key \ + --ssl-grpc-certificate-chain-file=client2.crt \ + -s ${STABILITY_THRESHOLD} \ + > ${CLIENT_LOG}.grpc_failure 2>&1 +if [ $? -eq 0 ]; then + cat ${CLIENT_LOG}.grpc_failure + echo -e "\n***\n*** Expected test failure\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +cp server.crt /etc/nginx/cert.crt +cp server.key /etc/nginx/cert.key + +SERVER_ARGS="--model-repository=${DATADIR}" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Setup the new configuration for the proxy. 
The HTTPS traffic will be +# redirected to the running instance of server at localhost:8000 +cp nginx.conf /etc/nginx/sites-available/default + +# Start the proxy server +service nginx restart + +# Test HTTP SSL +set +e + +# Test that HTTP protocol with SSL works correctly with certificates +$PERF_ANALYZER -v -u https://localhost:443 -i http -m graphdef_int32_int32_int32 \ + --ssl-https-verify-peer 1 \ + --ssl-https-verify-host 2 \ + --ssl-https-ca-certificates-file ca.crt \ + --ssl-https-client-certificate-file client.crt \ + --ssl-https-client-certificate-type PEM \ + --ssl-https-private-key-file client.key \ + --ssl-https-private-key-type PEM \ + -s ${STABILITY_THRESHOLD} \ + > ${CLIENT_LOG}.https_success 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.https_success + RET=1 +fi + +# Test that HTTP protocol with SSL works correctly without certificates +$PERF_ANALYZER -v -u https://localhost:443 -i http -m graphdef_int32_int32_int32 \ + --ssl-https-verify-peer 0 \ + --ssl-https-verify-host 0 \ + -s ${STABILITY_THRESHOLD} \ + > ${CLIENT_LOG}.https_success 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.https_success + RET=1 +fi + +# Test that HTTP protocol with SSL fails with incorrect key +$PERF_ANALYZER -v -u https://localhost:443 -i http -m graphdef_int32_int32_int32 \ + --ssl-https-verify-peer 1 \ + --ssl-https-verify-host 2 \ + --ssl-https-ca-certificates-file ca.crt \ + --ssl-https-client-certificate-file client.crt \ + --ssl-https-client-certificate-type PEM \ + --ssl-https-private-key-file client2.key \ + --ssl-https-private-key-type PEM \ + -s ${STABILITY_THRESHOLD} \ + > ${CLIENT_LOG}.https_failure 2>&1 +if [ $? -eq 0 ]; then + cat ${CLIENT_LOG}.https_failure + echo -e "\n***\n*** Expected test failure\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_analyzer_capi/test.sh b/qa/L0_perf_analyzer_capi/test.sh new file mode 100755 index 0000000000..3e3f9e4af6 --- /dev/null +++ b/qa/L0_perf_analyzer_capi/test.sh @@ -0,0 +1,330 @@ +#!/bin/bash +# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# TESTS COPIED FROM L0_perf_analyzer/test.sh +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./perf_analyzer.log" +PERF_ANALYZER=../clients/perf_analyzer + +DATADIR=`pwd`/models +TESTDATADIR=`pwd`/test_data + +SERVER_LIBRARY_PATH=/opt/tritonserver + +FLOAT_DIFFSHAPE_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/float_data_with_shape.json +STRING_WITHSHAPE_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/string_data_with_shape.json +SEQ_JSONDATAFILE=`pwd`/../common/perf_analyzer_input_data_json/seq_data.json +SHAPETENSORADTAFILE=`pwd`/../common/perf_analyzer_input_data_json/shape_tensor_data.json + +ERROR_STRING="error | Request count: 0 | : 0 infer/sec" + +STABILITY_THRESHOLD="9999" + +source ../common/util.sh + +rm -f $CLIENT_LOG +rm -rf $DATADIR $TESTDATADIR $ENSEMBLE_DATADIR + +mkdir -p $DATADIR +# Copy fixed-shape models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_int32_int32_int32 $DATADIR/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/graphdef_object_object_object $DATADIR/ + +# Copy a variable-shape models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/graphdef_object_int32_int32 $DATADIR/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/graphdef_int32_int32_float32 $DATADIR/ + +# Copy shape tensor models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_shapetensor_model_repository/plan_zero_1_float32_int32 $DATADIR/ + +# Copying ensemble including a sequential model +cp -r /data/inferenceserver/${REPO_VERSION}/qa_sequence_model_repository/savedmodel_sequence_object $DATADIR +cp -r /data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_sequence_model_repository/simple_savedmodel_sequence_object $DATADIR + +# Copying variable sequence model +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_sequence_model_repository/graphdef_sequence_float32 $DATADIR + +# Copying bls model with undefined variable +mkdir -p $DATADIR/bls_undefined/1 && \ + cp ../python_models/bls_undefined/model.py $DATADIR/bls_undefined/1/. && \ + cp ../python_models/bls_undefined/config.pbtxt $DATADIR/bls_undefined/. + +# Generating test data +mkdir -p $TESTDATADIR +for INPUT in INPUT0 INPUT1; do + for i in {1..16}; do + echo '1' >> $TESTDATADIR/${INPUT} + done +done + +RET=0 + +########## Test C API ############# +# Make sure tritonserver is not running first +set +e +SERVER_PID=$(pidof tritonserver) +if [ $? 
-ne 1 ]; then +echo -e "\n There was a previous instance of tritonserver, killing \n" + kill $SERVER_PID + wait $SERVER_PID +fi +set -e + +# Testing simple configuration +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 \ +--service-kind=triton_c_api \ +--model-repository=$DATADIR --triton-server-directory=$SERVER_LIBRARY_PATH \ +-s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +#Testing with string input +$PERF_ANALYZER -v -m graphdef_object_object_object --string-data=1 -p2000 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# Testing with variable inputs +$PERF_ANALYZER -v -m graphdef_object_int32_int32 --input-data=$TESTDATADIR \ +--shape INPUT0:2,8 --shape INPUT1:2,8 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m graphdef_object_int32_int32 \ +--input-data=$STRING_WITHSHAPE_JSONDATAFILE \ +--shape INPUT0:2,8 --shape INPUT1:2,8 -p2000 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m graphdef_int32_int32_float32 --shape INPUT0:2,8,2 \ +--shape INPUT1:2,8,2 -p2000 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# Shape tensor I/O model (server needs the shape tensor on the CPU) +$PERF_ANALYZER -v -m plan_zero_1_float32_int32 --input-data=$SHAPETENSORADTAFILE \ +--shape DUMMY_INPUT0:4,4 -p2000 -b 8 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep ": 0 infer/sec\|: 0 usec" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_ANALYZER -v -m simple_savedmodel_sequence_object -p 2000 -t5 --sync \ +--input-data=$SEQ_JSONDATAFILE \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set +e +$PERF_ANALYZER -v -m graphdef_sequence_float32 --shape INPUT:2 \ +--input-data=$FLOAT_DIFFSHAPE_JSONDATAFILE \ +--input-data=$FLOAT_DIFFSHAPE_JSONDATAFILE -p2000 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH --sync >$CLIENT_LOG 2>&1 +if [ $? -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep -P "The supplied shape .+ is incompatible with the model's input shape" | wc -l) -eq 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +# Negative test for the async mode. +set +e +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 -a \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $(cat $CLIENT_LOG | grep "not supported by triton_c_api service" | wc -l) -ne 1 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +for SHARED_MEMORY_TYPE in system cuda; do + $PERF_ANALYZER -v -m graphdef_int32_int32_int32 -t 1 -p2000 -b 1 \ + --shared-memory=$SHARED_MEMORY_TYPE \ + --service-kind=triton_c_api --model-repository=$DATADIR \ + --triton-server-directory=$SERVER_LIBRARY_PATH >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +done + + +$PERF_ANALYZER -v -m graphdef_int32_int32_int32 --request-rate-range 1000:2000:500 -p1000 -b 1 \ +--service-kind=triton_c_api --model-repository=$DATADIR \ +--triton-server-directory=$SERVER_LIBRARY_PATH -s ${STABILITY_THRESHOLD} \ +>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +set +e +# Testing erroneous configuration +# This model is expected to fail +$PERF_ANALYZER -v -m bls_undefined --shape INPUT0:1048576 -t 64\ +--service-kind=triton_c_api \ +--model-repository=$DATADIR --triton-server-directory=$SERVER_LIBRARY_PATH \ +-s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 +if [ $? -ne 99 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +# Make sure server is not still running +set +e +SERVER_PID=$(pidof tritonserver) +if [ $? 
-eq 0 ]; then + echo -e "\n Tritonserver did not exit properly, killing \n" + kill $SERVER_PID + wait $SERVER_PID + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_perf_analyzer_doc_links/mkdocs.yml b/qa/L0_perf_analyzer_doc_links/mkdocs.yml new file mode 100644 index 0000000000..41a4bfe485 --- /dev/null +++ b/qa/L0_perf_analyzer_doc_links/mkdocs.yml @@ -0,0 +1,36 @@ +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +site_name: CI Test +use_directory_urls: False +docs_dir: "./docs" +plugins: + - htmlproofer + - search + +markdown_extensions: + - toc: + permalink: True diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh new file mode 100755 index 0000000000..d0757bca9e --- /dev/null +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +LOG="`pwd`/doc_links.log" +CONFIG="`pwd`/mkdocs.yml" +RET=0 + +# Download necessary packages +python3 -m pip install mkdocs +python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 + +#Download perf_analyzer docs +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_PERF_ANALYZER_REPO_TAG="${TRITON_PERF_ANALYZER_REPO_TAG:=main}" +git clone -b ${TRITON_PERF_ANALYZER_REPO_TAG} ${TRITON_REPO_ORGANIZATION}/perf_analyzer.git +cp `pwd`/perf_analyzer/README.md . +cp -rf `pwd`/perf_analyzer/docs . + +# Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. +# This breaks all links to cli commands throughout the docs. This will iterate over all +# files in the docs directory and remove -- and - at the start of options, which allows the +# tool to check links for correctness. +for file in `pwd`/docs/*.md +do + echo $file + sed -i 's/`-*/`/g' $file + sed -i 's/#-*/#/g' $file +done + +exec mkdocs serve -f $CONFIG > $LOG & +PID=$! +sleep 20 + +until [[ (-z `pgrep mkdocs`) ]]; do + kill -2 $PID + sleep 2 +done + +if [[ ! -z `grep "invalid url" $LOG` ]]; then + cat $LOG + RET=1 +fi + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test PASSED\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_perf_analyzer_ground_truth/test.sh b/qa/L0_perf_analyzer_ground_truth/test.sh new file mode 100755 index 0000000000..d5d78e63f4 --- /dev/null +++ b/qa/L0_perf_analyzer_ground_truth/test.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "${REPO_VERSION}" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +# Setup client/perf_analyzer +CLIENT_LOG="./perf_analyzer.log" +PERF_ANALYZER=../clients/perf_analyzer + +function check_perf_analyzer_error { + ERROR_STRING="error | Request count: 0 | : 0 infer/sec" + CLIENT_RET="$1" + if [ ${CLIENT_RET} -ne 0 ]; then + cat ${CLIENT_LOG} + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat ${CLIENT_LOG} | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat ${CLIENT_LOG} + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +} + +# Checks that the model infer/sec performance is equal to an expected value +# +/- some tolerance. +# $1: csv result file from PA run +# $2: expected infer/sec value +# $3: tolerance for expected value equality +function check_performance { + # get the boundary values based on the tolerance percentage + MIN=$(python3 -c "print(${2} * (1 - ${3}))") + MAX=$(python3 -c "print(${2} * (1 + ${3}))") + + # delete all but the 2nd line in the resulting file + # then get the 2nd column value which is the infer/sec measurement + report_val=$(sed '2!d' $1 | awk -F ',' {'print $2'}) + + # check if within tolerance + ret=$(python3 -c "print(${report_val} >= ${MIN} and ${report_val} <= ${MAX})") + if [ "$ret" = "False" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +} + +# Iterate over the grpc results to ensure gRPC times are greater than 0 +# $1: client log file +# example line: Avg gRPC time: 42648 usec (marshal 6 usec + response wait 42640 usec + unmarshal 2 usec) +function check_grpc_time { + grep "gRPC" $1 | awk '{print $4}' | while read -r line; do + if [ $line -eq 0 ]; then + RET=1 + fi + done +} + +# Create input_data.json to communicate the requested model delay +# $1: desired model delay +function create_input_data { + echo "{\"data\":[{\"INPUT0\" : [${1}]}]}" > input_data.json +} + +# Setup server +export CUDA_VISIBLE_DEVICES=0 +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" + +rm -f $SERVER_LOG $CLIENT_LOG +MODEL_DIR="./models" +rm -fr ${MODEL_DIR} && mkdir ${MODEL_DIR} +MODELS="ground_truth" + +for model in ${MODELS}; do + # Add version directory to each model if non-existent + mkdir -p "${MODEL_DIR}/${model}/1" + cp ../python_models/${model}/model.py ./models/${model}/1/model.py + cp ../python_models/${model}/config.pbtxt ./models/${model}/config.pbtxt +done + +# Run server +run_server +if [ "${SERVER_PID}" == "0" ]; then + echo -e "\n***\n*** Failed to start ${SERVER}\n***" + cat ${SERVER_LOG} + exit 1 +fi + +# Run perf_analyzer +set +e +RET=0 +PROTOCOLS="http grpc" +OUTPUT_FILE="results" +MODEL_DELAYS=(0.05 0.5) +TOLERANCE="0.05" + +for model_delay in 
${MODEL_DELAYS[@]}; do + create_input_data ${model_delay} + EXPECTED_RESULT=$(python3 -c "print(1 / ${model_delay})") + for protocol in ${PROTOCOLS}; do + for model in ${MODELS}; do + echo "================================================================" + echo "[PERMUTATION] Protocol=${protocol} Model=${model}" + echo "================================================================" + + ${PERF_ANALYZER} -v -i ${protocol} --concurrency-range 2 --input-data input_data.json -m ${model} -f ${OUTPUT_FILE} | tee ${CLIENT_LOG} 2>&1 + check_perf_analyzer_error $? + + check_performance ${OUTPUT_FILE} ${EXPECTED_RESULT} ${TOLERANCE} + + if [ "${protocol}" == "grpc" ]; then + check_grpc_time ${CLIENT_LOG} + fi + done; + done; +done; + + +set -e + +# Cleanup +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo "=== START SERVER LOG ===" + cat ${SERVER_LOG} + echo "=== END SERVER LOG ===" + echo "=== START CLIENT LOG ===" + cat ${CLIENT_LOG} + echo "=== END CLIENT LOG ===" + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit ${RET} diff --git a/qa/L0_perf_analyzer_report/test.sh b/qa/L0_perf_analyzer_report/test.sh new file mode 100755 index 0000000000..469d11ce3a --- /dev/null +++ b/qa/L0_perf_analyzer_report/test.sh @@ -0,0 +1,175 @@ +#!/bin/bash +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "${REPO_VERSION}" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +# Setup client/perf_analyzer +CLIENT_LOG="./perf_analyzer.log" +PERF_ANALYZER=../clients/perf_analyzer + +function check_perf_analyzer_error { + ERROR_STRING="error | Request count: 0 | : 0 infer/sec" + CLIENT_RET="$1" + if [ ${CLIENT_RET} -ne 0 ]; then + cat ${CLIENT_LOG} + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + if [ $(cat ${CLIENT_LOG} | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then + cat ${CLIENT_LOG} + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +} + +function check_cache_output { + # Validate cache info in perf_analyzer output + CACHE_STRING="Cache hit count" + if [ $(cat ${CLIENT_LOG} | grep -i "${CACHE_STRING}" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG} + echo "ERROR: No cache hit count found in output" + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + # Validate non-zero number of cache hits + ERROR_STRING="Cache hit count: 0" + num_cache_hit_lines=$(cat ${CLIENT_LOG} | grep -i "${CACHE_STRING}" | wc -l) + num_cache_hit_zero_lines=$(cat ${CLIENT_LOG} | grep -i "${ERROR_STRING}" | wc -l) + if [ ${num_cache_hit_zero_lines} -eq ${num_cache_hit_lines} ]; then + cat ${CLIENT_LOG} + echo "ERROR: All cache hit counts were zero, expected a non-zero number of cache hits" + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +} + +# Setup server +export CUDA_VISIBLE_DEVICES=0 +SERVER=/opt/tritonserver/bin/tritonserver +# --response-cache-byte-size must be non-zero to test models with cache enabled +SERVER_ARGS="--model-repository=`pwd`/models --response-cache-byte-size=8192" +SERVER_LOG="./inference_server.log" + +# Setup model repository from existing qa_model_repository +rm -f $SERVER_LOG $CLIENT_LOG +MODEL_DIR="./models" +rm -fr ${MODEL_DIR} && mkdir ${MODEL_DIR} +ENSEMBLE_MODEL="simple_onnx_float32_float32_float32" +COMPOSING_MODEL="onnx_float32_float32_float32" +ENSEMBLE_MODEL_CACHE_ENABLED="${ENSEMBLE_MODEL}_cache_enabled" +ENSEMBLE_MODEL_CACHE_DISABLED="${ENSEMBLE_MODEL}_cache_disabled" +COMPOSING_MODEL_CACHE_ENABLED="${COMPOSING_MODEL}_cache_enabled" +COMPOSING_MODEL_CACHE_DISABLED="${COMPOSING_MODEL}_cache_disabled" +MODELS="${ENSEMBLE_MODEL_CACHE_ENABLED} ${ENSEMBLE_MODEL_CACHE_DISABLED} ${COMPOSING_MODEL_CACHE_ENABLED} ${COMPOSING_MODEL_CACHE_DISABLED}" + +## Setup ensemble models, one with cache enabled and one with cache disabled +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository/${ENSEMBLE_MODEL}" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository/${ENSEMBLE_MODEL}" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}" + +## Setup composing models, one with cache enabled and one with cache disabled +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_model_repository/${COMPOSING_MODEL}" "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_ENABLED}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_model_repository/${COMPOSING_MODEL}" "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_DISABLED}" + +for model in ${MODELS}; do + # Remove "name" line from each config to use directory name for simplicity + sed -i "/^name:/d" "${MODEL_DIR}/${model}/config.pbtxt" + # Add version directory to each model if non-existent + mkdir -p "${MODEL_DIR}/${model}/1" +done + +## Update "model_name" lines in each ensemble model config ensemble steps +sed -i "s/${COMPOSING_MODEL}/${COMPOSING_MODEL_CACHE_ENABLED}/g" 
"${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}/config.pbtxt" +sed -i "s/${COMPOSING_MODEL}/${COMPOSING_MODEL_CACHE_DISABLED}/g" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}/config.pbtxt" + +## Append cache config to each model config +echo -e "response_cache { enable: True }" >> "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}/config.pbtxt" +echo -e "response_cache { enable: False }" >> "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}/config.pbtxt" +echo -e "response_cache { enable: True }" >> "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_ENABLED}/config.pbtxt" +echo -e "response_cache { enable: False }" >> "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_DISABLED}/config.pbtxt" +# Force CPU memory for composing models since cache doesn't currently support GPU memory +echo -e "instance_group [{ kind: KIND_CPU, count: 1 }]" >> "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_ENABLED}/config.pbtxt" +echo -e "instance_group [{ kind: KIND_CPU, count: 1 }]" >> "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_DISABLED}/config.pbtxt" + +# Run server +run_server +if [ "${SERVER_PID}" == "0" ]; then + echo -e "\n***\n*** Failed to start ${SERVER}\n***" + cat ${SERVER_LOG} + exit 1 +fi + +# Run perf_analyzer +set +e +RET=0 +PROTOCOLS="http grpc" +STABILITY_THRESHOLD="15" +for protocol in ${PROTOCOLS}; do + for model in ${MODELS}; do + echo "================================================================" + echo "[PERMUTATION] Protocol=${protocol} Model=${model}" + echo "================================================================" + + ${PERF_ANALYZER} -v -i ${protocol} -m ${model} -s ${STABILITY_THRESHOLD} | tee ${CLIENT_LOG} 2>&1 + check_perf_analyzer_error $? + + # Check response cache outputs + if [[ ${model} == *"cache_enabled"* ]]; then + check_cache_output + fi + done; +done; +set -e + +# Cleanup +kill $SERVER_PID +wait $SERVER_PID + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo "=== START SERVER LOG ===" + cat ${SERVER_LOG} + echo "=== END SERVER LOG ===" + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit ${RET} diff --git a/qa/L0_perf_analyzer_unit_tests/test.sh b/qa/L0_perf_analyzer_unit_tests/test.sh new file mode 100755 index 0000000000..f2a70d23ff --- /dev/null +++ b/qa/L0_perf_analyzer_unit_tests/test.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +TEST_LOG="./perf_analyzer_unit_tests.log" +PERF_ANALYZER_UNIT_TESTS=../clients/perf_analyzer_unit_tests + +RET=0 + +rm -f $TEST_LOG + +set +e +$PERF_ANALYZER_UNIT_TESTS >> $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_deeprecommender/run_test.sh b/qa/L0_perf_deeprecommender/run_test.sh new file mode 100755 index 0000000000..75fd68704d --- /dev/null +++ b/qa/L0_perf_deeprecommender/run_test.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +STATIC_BATCH_SIZES=${STATIC_BATCH_SIZES:=1} +DYNAMIC_BATCH_SIZES=${DYNAMIC_BATCH_SIZES:=1} +INSTANCE_COUNTS=${INSTANCE_COUNTS:=1} +TF_VERSION=${TF_VERSION:=2} + +PERF_CLIENT=../clients/perf_client +REPORTER=../common/reporter.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --backend-config=tensorflow,version=${TF_VERSION}" +source ../common/util.sh + +# Select the single GPU that will be available to the inference +# server. Or use "export CUDA_VISIBLE_DEVICE=" to run on CPU. 
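+# Illustrative alternative (an assumption, not exercised by this test): a +# CPU-only run would leave the device list empty before launching, i.e. +#   export CUDA_VISIBLE_DEVICES=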
+export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +for STATIC_BATCH in $STATIC_BATCH_SIZES; do + for DYNAMIC_BATCH in $DYNAMIC_BATCH_SIZES; do + for INSTANCE_CNT in $INSTANCE_COUNTS; do + if (( ($DYNAMIC_BATCH > 1) && ($STATIC_BATCH >= $DYNAMIC_BATCH) )); then + continue + fi + + MAX_BATCH=${STATIC_BATCH} && \ + (( $DYNAMIC_BATCH > $STATIC_BATCH )) && \ + MAX_BATCH=${DYNAMIC_BATCH} + + if (( $DYNAMIC_BATCH > 1 )); then + NAME=${MODEL_NAME}_sbatch${STATIC_BATCH}_dbatch${DYNAMIC_BATCH}_instance${INSTANCE_CNT}_${PERF_CLIENT_PROTOCOL} + else + NAME=${MODEL_NAME}_sbatch${STATIC_BATCH}_instance${INSTANCE_CNT}_${PERF_CLIENT_PROTOCOL} + fi + + rm -fr models && mkdir -p models && \ + cp -r $MODEL_PATH models/. && \ + (cd models/$MODEL_NAME && \ + sed -i "s/^max_batch_size:.*/max_batch_size: ${MAX_BATCH}/" config.pbtxt && \ + echo "instance_group [ { count: ${INSTANCE_CNT} }]" >> config.pbtxt) + if (( $DYNAMIC_BATCH > 1 )); then + (cd models/$MODEL_NAME && \ + echo "dynamic_batching { preferred_batch_size: [ ${DYNAMIC_BATCH} ] }" >> config.pbtxt) + fi + + echo "Time before starting server: $(date)" + SERVER_LOG="${NAME}.server.log" + run_server + if (( $SERVER_PID == 0 )); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + echo "Time before perf analyzer trials: $(date)" + + # Run the model once to warm up. Some frameworks do + # optimization on the first requests. Must warmup similar + # to actual run so that all instances are ready + $PERF_CLIENT -v -i ${PERF_CLIENT_PROTOCOL} -m $MODEL_NAME -p5000 \ + -b${STATIC_BATCH} --concurrency-range ${CONCURRENCY} + + set -o pipefail + PA_MAX_TRIALS=${PA_MAX_TRIALS:-"50"} + $PERF_CLIENT -v -i ${PERF_CLIENT_PROTOCOL} -m $MODEL_NAME -p5000 \ + -b${STATIC_BATCH} --concurrency-range ${CONCURRENCY} \ + --max-trials "${PA_MAX_TRIALS}" \ + -f ${NAME}.csv 2>&1 | tee ${NAME}.log + if (( $? != 0 )); then + echo -e "\n***\n*** FAILED Perf Analyzer measurement\n***" + RET=1 + fi + echo "Time after perf analyzer trials: $(date)" + set +o pipefail + + curl localhost:8002/metrics -o ${NAME}.metrics >> ${NAME}.log 2>&1 + if (( $? != 0 )); then + echo -e "\n***\n*** FAILED to get metrics\n***" + RET=1 + fi + + set -e + + echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >> ${NAME}.tjson + echo -e "\"s_benchmark_name\":\"deeprecommender\"," >> ${NAME}.tjson + echo -e "\"s_server\":\"triton\"," >> ${NAME}.tjson + echo -e "\"s_protocol\":\"${PERF_CLIENT_PROTOCOL}\"," >> ${NAME}.tjson + echo -e "\"s_framework\":\"${MODEL_FRAMEWORK}\"," >> ${NAME}.tjson + echo -e "\"s_model\":\"${MODEL_NAME}\"," >> ${NAME}.tjson + echo -e "\"l_concurrency\":${CONCURRENCY}," >> ${NAME}.tjson + echo -e "\"l_dynamic_batch_size\":${DYNAMIC_BATCH}," >> ${NAME}.tjson + echo -e "\"l_batch_size\":${STATIC_BATCH}," >> ${NAME}.tjson + echo -e "\"l_instance_count\":${INSTANCE_CNT}}]" >> ${NAME}.tjson + + kill $SERVER_PID + wait $SERVER_PID + + if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + $REPORTER -v -o ${NAME}.json --csv ${NAME}.csv ${URL_FLAG} ${NAME}.tjson + if (( $? 
!= 0 )); then + RET=1 + fi + + set -e + fi + done + done +done + +if (( $RET == 0 )); then + echo -e "\n***\n*** $FRAMEWORK Test Passed\n***" +else + echo -e "\n***\n*** $FRAMEWORK Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_deeprecommender/test.sh b/qa/L0_perf_deeprecommender/test.sh new file mode 100755 index 0000000000..dc61d56e98 --- /dev/null +++ b/qa/L0_perf_deeprecommender/test.sh @@ -0,0 +1,205 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +RET=0 +REPODIR=/data/inferenceserver/${REPO_VERSION} +TRTEXEC=/usr/src/tensorrt/bin/trtexec +MODEL="deeprecommender" +PROTOCOLS="grpc http" + +rm -f *.log *.csv *.metrics *.tjson *.json + +# +# Test minimum latency +# +STATIC_BATCH=1 +INSTANCE_CNT=1 +CONCURRENCY=1 + +# Create the TensorRT plan from ONNX +rm -fr tensorrt_models && mkdir -p tensorrt_models/deeprecommender_plan/0 && \ +cp $REPODIR/perf_model_store/deeprecommender_onnx/1/model.onnx tensorrt_models/deeprecommender_plan && \ +(cd tensorrt_models/deeprecommender_plan && \ +echo 'name: "deeprecommender_plan" +platform: "tensorrt_plan" +max_batch_size: ${STATIC_BATCH} +input [ + { + name: "Placeholder:0" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [17736,1,1] + } +] +output [ + { + name: "fc5/Relu:0" + data_type: TYPE_FP32 + dims: [17736] + } +]' >| config.pbtxt) + +$TRTEXEC --onnx=tensorrt_models/deeprecommender_plan/model.onnx --verbose \ + --saveEngine=tensorrt_models/deeprecommender_plan/0/model.plan \ + --minShapes=Placeholder:0:1x17736x1x1 \ + --optShapes=Placeholder:0:${STATIC_BATCH}x17736x1x1 \ + --maxShapes=Placeholder:0:${STATIC_BATCH}x17736x1x1 + +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Failed to generate TensorRT Plan \n***" + exit 1 +fi +rm tensorrt_models/deeprecommender_plan/model.onnx + +OPTIMIZED_MODEL_NAMES="deeprecommender_graphdef_trt" + +# Create optimized models (TF-TRT and ONNX-TRT) +rm -fr optimized_model_store && mkdir optimized_model_store +for MODEL_NAME in $OPTIMIZED_MODEL_NAMES; do + BASE_MODEL=$(echo ${MODEL_NAME} | cut -d '_' -f 1,2) + cp -r $REPODIR/perf_model_store/${BASE_MODEL} optimized_model_store/${MODEL_NAME} + CONFIG_PATH="optimized_model_store/${MODEL_NAME}/config.pbtxt" + sed -i "s/^name: \"${BASE_MODEL}\"/name: \"${MODEL_NAME}\"/" ${CONFIG_PATH} + echo "optimization { execution_accelerators {" >> ${CONFIG_PATH} + echo "gpu_execution_accelerator : [ {" >> ${CONFIG_PATH} + echo "name : \"tensorrt\" " >> ${CONFIG_PATH} + echo "} ]" >> ${CONFIG_PATH} + echo "}}" >> ${CONFIG_PATH} +done + +# Tests with each model +for FRAMEWORK in graphdef plan graphdef_trt onnx libtorch; do + MODEL_NAME=${MODEL}_${FRAMEWORK} + if [ "$FRAMEWORK" == "plan" ]; then + REPO=`pwd`/tensorrt_models + elif [[ "$FRAMEWORK" == *"_trt" ]]; then + REPO=`pwd`/optimized_model_store + else + REPO=$REPODIR/perf_model_store + fi + for PROTOCOL in $PROTOCOLS; do + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH_SIZES=${STATIC_BATCH} \ + DYNAMIC_BATCH_SIZES=1 \ + PERF_CLIENT_PROTOCOL=${PROTOCOL} \ + INSTANCE_COUNTS=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + bash -x run_test.sh + if [ $? -ne 0 ]; then + RET=1 + fi + done +done + +# +# Test large static batch = 256 w/ 2 instances +# +STATIC_BATCH=256 +INSTANCE_CNT=2 +CONCURRENCY=4 + +# Create the TensorRT plan from ONNX +rm -fr tensorrt_models && mkdir -p tensorrt_models/deeprecommender_plan/0 && \ +cp $REPODIR/perf_model_store/deeprecommender_onnx/1/model.onnx tensorrt_models/deeprecommender_plan && \ +(cd tensorrt_models/deeprecommender_plan && \ +echo 'name: "deeprecommender_plan" +platform: "tensorrt_plan" +max_batch_size: ${STATIC_BATCH} +input [ + { + name: "Placeholder:0" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [17736,1,1] + } +] +output [ + { + name: "fc5/Relu:0" + data_type: TYPE_FP32 + dims: [17736] + } +]' >| config.pbtxt) + +$TRTEXEC --onnx=tensorrt_models/deeprecommender_plan/model.onnx --verbose \ + --saveEngine=tensorrt_models/deeprecommender_plan/0/model.plan \ + --minShapes=Placeholder:0:1x17736x1x1 \ + --optShapes=Placeholder:0:${STATIC_BATCH}x17736x1x1 \ + --maxShapes=Placeholder:0:${STATIC_BATCH}x17736x1x1 + +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to generate TensorRT Plan \n***" + exit 1 +fi +rm tensorrt_models/deeprecommender_plan/model.onnx + +# Tests with each model +for FRAMEWORK in graphdef plan graphdef_trt onnx libtorch; do + MODEL_NAME=${MODEL}_${FRAMEWORK} + if [ "$FRAMEWORK" == "plan" ]; then + REPO=`pwd`/tensorrt_models + elif [[ "$FRAMEWORK" == *"_trt" ]]; then + REPO=`pwd`/optimized_model_store + else + REPO=$REPODIR/perf_model_store + fi + for PROTOCOL in $PROTOCOLS; do + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH_SIZES=${STATIC_BATCH} \ + DYNAMIC_BATCH_SIZES=1 \ + PERF_CLIENT_PROTOCOL=${PROTOCOL} \ + INSTANCE_COUNTS=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + bash -x run_test.sh + if [ $? 
-ne 0 ]; then + RET=1 + fi + done +done + +if (( $RET == 0 )); then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi diff --git a/qa/L0_perf_kaldi/create_data.sh b/qa/L0_perf_kaldi/create_data.sh new file mode 100755 index 0000000000..849b56d906 --- /dev/null +++ b/qa/L0_perf_kaldi/create_data.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Needs to be run in asr_kaldi main directory and must be copied to +# draco for benchmark test +TRITON_VERSION="20.05" + +nvidia-docker run --rm \ + --shm-size=1g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -v $PWD/data:/mnt/data \ + gitlab-master.nvidia.com:5005/dl/joc/asr_kaldi:${TRITON_VERSION}-server-py3-devel \ + /workspace/scripts/docker/dataset_setup.sh $(id -u) $(id -g) diff --git a/qa/L0_perf_kaldi/test.sh b/qa/L0_perf_kaldi/test.sh new file mode 100755 index 0000000000..31d4c99ee6 --- /dev/null +++ b/qa/L0_perf_kaldi/test.sh @@ -0,0 +1,127 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Test with 20.05 because kaldi image for 20.06 is not yet available +TRITON_VERSION="20.05" + +cd /workspace +git clone --single-branch --depth=1 -b r${TRITON_VERSION} \ + https://github.com/NVIDIA/triton-inference-server.git + +echo "add_subdirectory(kaldi-asr-client)" >> triton-inference-server/src/clients/c++/CMakeLists.txt + +cp -r asr_kaldi/kaldi-asr-client triton-inference-server/src/clients/c++ +cp -r asr_kaldi/model-repo/kaldi_online/config.pbtxt model-repo/kaldi_online/ + +# Client dependencies +(apt-get update && \ + apt-get install -y --no-install-recommends \ + libssl-dev \ + libb64-dev \ + rapidjson-dev) + +pip3 install --upgrade wheel setuptools grpcio-tools + +# Build client library and kaldi perf client +(cd triton-inference-server/build && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX:PATH=/workspace/install && \ + make -j16 trtis-clients) + +RET=0 +rm -rf *.log + +# Run server +/opt/tritonserver/bin/trtserver --model-repo=/workspace/model-repo > server.log 2>&1 & +SERVER_PID=$! +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start server\n***" + cat server.log + exit 1 +fi + +KALDI_CLIENT=install/bin/kaldi_asr_parallel_client + +# Run client +RESULTS_DIR="/data/results" +mkdir -p $RESULTS_DIR + +CONCURRENCY=2000 + +# Client only supports GRPC (5 iterations on the dataset) +$KALDI_CLIENT -i 5 -c ${CONCURRENCY} >> client_1.log 2>&1 +if (( $? != 0 )); then + RET=1 +fi + +# Capture Throughput +THROUGHPUT=`cat client_1.log | grep 'Throughput:' | cut -f 2 | cut -f 1 -d ' '` + +# '-o' Flag is needed to run online and capture latency +$KALDI_CLIENT -i 5 -c ${CONCURRENCY} -o >> client_2.log 2>&1 +if (( $? != 0 )); then + RET=1 +fi + +# Capture Latency 95 percentile +LATENCY_95=`cat client_2.log | grep -A1 "Latencies:" | sed -n '2 p' | cut -f 5` + +REPORTER=triton-inference-server/qa/common/reporter.py + +echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >> results.tjson +echo -e "\"s_benchmark_name\":\"kaldi\"," >> results.tjson +echo -e "\"s_server\":\"triton\"," >> results.tjson +echo -e "\"s_protocol\":\"grpc\"," >> results.tjson +echo -e "\"s_model\":\"asr_kaldi\"," >> results.tjson +echo -e "\"l_concurrency\":${CONCURRENCY}," >> results.tjson +echo -e "\"d_infer_per_sec\":${THROUGHPUT}," >> results.tjson +echo -e "\"d_latency_p95_ms\":${LATENCY_95}," >> results.tjson +echo -e "\"l_instance_count\":1}]" >> results.tjson + +if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + $REPORTER -v -o results.json ${URL_FLAG} results.tjson + if (( $? 
!= 0 )); then + RET=1 + fi + + set -e +fi + +if (( $RET == 0 )); then + echo -e "\n***\n*** ASR Kaldi Benchmark Passed\n***" +else + echo -e "\n***\n*** ASR Kaldi Benchmark FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_nomodel/custom_models/custom_zero_1_float32/config.pbtxt b/qa/L0_perf_nomodel/custom_models/custom_zero_1_float32/config.pbtxt new file mode 100644 index 0000000000..de852749c0 --- /dev/null +++ b/qa/L0_perf_nomodel/custom_models/custom_zero_1_float32/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_zero_1_float32" +backend: "identity" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] diff --git a/qa/L0_perf_nomodel/run_test.sh b/qa/L0_perf_nomodel/run_test.sh new file mode 100755 index 0000000000..b1e2702ecb --- /dev/null +++ b/qa/L0_perf_nomodel/run_test.sh @@ -0,0 +1,252 @@ +#!/bin/bash +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=$1 + +BACKENDS=${BACKENDS:="plan custom graphdef savedmodel onnx libtorch python"} +STATIC_BATCH_SIZES=${STATIC_BATCH_SIZES:=1} +DYNAMIC_BATCH_SIZES=${DYNAMIC_BATCH_SIZES:=1} +INSTANCE_COUNTS=${INSTANCE_COUNTS:=1} +CONCURRENCY=${CONCURRENCY:=1} + +PERF_CLIENT_PROTOCOL=${PERF_CLIENT_PROTOCOL:=grpc} +PERF_CLIENT_PERCENTILE=${PERF_CLIENT_PERCENTILE:=95} +PERF_CLIENT_STABILIZE_WINDOW=${PERF_CLIENT_STABILIZE_WINDOW:=5000} +PERF_CLIENT_STABILIZE_THRESHOLD=${PERF_CLIENT_STABILIZE_THRESHOLD:=5} +TENSOR_SIZE=${TENSOR_SIZE:=1} +SHARED_MEMORY=${SHARED_MEMORY:="none"} +REPORTER=../common/reporter.py + +RESULTDIR=${RESULTDIR:=.} + +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +ARCH=${ARCH:="x86_64"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +MODEL_REPO="${PWD}/models" +PERF_CLIENT=../clients/perf_client +TF_VERSION=${TF_VERSION:=2} +SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} --backend-config=tensorflow,version=${TF_VERSION}" +source ../common/util.sh + +# DATADIR is already set in environment variable for aarch64 +if [ "$ARCH" != "aarch64" ]; then + DATADIR="/data/inferenceserver/${REPO_VERSION}" +fi + +# Select the single GPU that will be available to the inference server +export CUDA_VISIBLE_DEVICES=0 + +mkdir -p ${RESULTDIR} +RET=0 + +if [[ $BACKENDS == *"python"* ]]; then + cp /opt/tritonserver/backends/python/triton_python_backend_utils.py . + + mkdir -p python_models/python_zero_1_float32/1 && \ + cp ../python_models/identity_fp32/model.py ./python_models/python_zero_1_float32/1/model.py && \ + cp ../python_models/identity_fp32/config.pbtxt ./python_models/python_zero_1_float32/config.pbtxt + (cd python_models/python_zero_1_float32 && \ + sed -i "s/^name:.*/name: \"python_zero_1_float32\"/" config.pbtxt) +fi + +if [[ $BACKENDS == *"custom"* ]]; then + mkdir -p "custom_models/custom_zero_1_float32/1" +fi + +PERF_CLIENT_PERCENTILE_ARGS="" && + (( ${PERF_CLIENT_PERCENTILE} != 0 )) && + PERF_CLIENT_PERCENTILE_ARGS="--percentile=${PERF_CLIENT_PERCENTILE}" +PERF_CLIENT_EXTRA_ARGS="$PERF_CLIENT_PERCENTILE_ARGS --shared-memory ${SHARED_MEMORY}" + +# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and +# reporting structure, though "triton_c_api" is not strictly a "protocol". +if [[ "${PERF_CLIENT_PROTOCOL}" == "triton_c_api" ]]; then + # Server will be run in-process with C API + SERVICE_ARGS="--service-kind triton_c_api \ + --triton-server-directory ${TRITON_DIR} \ + --model-repository ${MODEL_REPO}" +else + SERVICE_ARGS="-i ${PERF_CLIENT_PROTOCOL}" +fi + +# +# Use "identity" model for all model types. 
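+# For example (one illustrative permutation of the loop below, assuming the +# default settings): with BACKEND=onnx, STATIC_BATCH=1, DYNAMIC_BATCH=1 and +# INSTANCE_CNT=1, the loop benchmarks the identity model onnx_zero_1_float32 +# from qa_identity_model_repository (it only copies INPUT0 to OUTPUT0, so the +# numbers reflect serving overhead rather than model compute) and writes its +# results to ${RESULTDIR}/onnx_sbatch1_instance1.csv and .log.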
+# +for BACKEND in $BACKENDS; do + for STATIC_BATCH in $STATIC_BATCH_SIZES; do + for DYNAMIC_BATCH in $DYNAMIC_BATCH_SIZES; do + for INSTANCE_CNT in $INSTANCE_COUNTS; do + if (( ($DYNAMIC_BATCH > 1) && ($STATIC_BATCH >= $DYNAMIC_BATCH) )); then + continue + fi + + # plan and openvino models do not support 16MB I/O tests + if ([ $BACKEND == "plan" ] || [ $BACKEND == "openvino" ]) && [ $TENSOR_SIZE != 1 ]; then + continue + fi + + # set input name (special case for libtorch model) + INPUT_NAME="INPUT0" && [ $BACKEND == "libtorch" ] && INPUT_NAME="INPUT__0" + + MAX_LATENCY=300 + MAX_BATCH=${STATIC_BATCH} && [ $DYNAMIC_BATCH > $STATIC_BATCH ] && MAX_BATCH=${DYNAMIC_BATCH} + + # TODO Add openvino identity model that supports batching/dynamic batching + # The current openvino identity model does also not support batching + if [ $BACKEND == "openvino" ]; then + if [ $MAX_BATCH != 1 ]; then + continue + else + MAX_BATCH=0 + fi + fi + + if [ $DYNAMIC_BATCH > 1 ]; then + NAME=${BACKEND}_sbatch${STATIC_BATCH}_dbatch${DYNAMIC_BATCH}_instance${INSTANCE_CNT} + else + NAME=${BACKEND}_sbatch${STATIC_BATCH}_instance${INSTANCE_CNT} + fi + + # set model name (special case for openvino i.e. nobatch) + MODEL_NAME=${BACKEND}_zero_1_float32 && [ $BACKEND == "openvino" ] && MODEL_NAME=${BACKEND}_nobatch_zero_1_float32 + + if [ $BACKEND == "custom" ]; then + REPO_DIR=./custom_models + elif [ $BACKEND == "python" ]; then + REPO_DIR=./python_models + else + REPO_DIR=$DATADIR/qa_identity_model_repository + fi + + SHAPE=${TENSOR_SIZE} + KIND="KIND_GPU" && [ $BACKEND == "custom" ] || [ $BACKEND == "python" ] || [ $BACKEND == "openvino" ] && KIND="KIND_CPU" + + rm -fr models && mkdir -p models && \ + cp -r $REPO_DIR/$MODEL_NAME models/. && \ + (cd models/$MODEL_NAME && \ + sed -i "s/^max_batch_size:.*/max_batch_size: ${MAX_BATCH}/" config.pbtxt) + + # python model already has instance count and kind + if [ $BACKEND == "python" ]; then + (cd models/$MODEL_NAME && \ + sed -i "s/count:.*/count: ${INSTANCE_CNT}/" config.pbtxt) + else + (cd models/$MODEL_NAME && \ + echo "instance_group [ { kind: ${KIND}, count: ${INSTANCE_CNT} }]" >> config.pbtxt) + fi + + if [ $BACKEND == "custom" ]; then + (cd models/$MODEL_NAME && \ + sed -i "s/dims:.*\[.*\]/dims: \[ ${SHAPE} \]/g" config.pbtxt) + fi + if [ $DYNAMIC_BATCH > 1 ] && [ $BACKEND != "openvino" ]; then + (cd models/$MODEL_NAME && \ + echo "dynamic_batching { preferred_batch_size: [ ${DYNAMIC_BATCH} ] }" >> config.pbtxt) + fi + + echo "Time before starting server: $(date)" + # Only start separate server if not using C API, since C API runs server in-process + if [[ "${PERF_CLIENT_PROTOCOL}" != "triton_c_api" ]]; then + SERVER_LOG="${RESULTDIR}/${NAME}.server.log" + run_server + if [ $SERVER_PID == 0 ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + fi + + echo "Time before perf analyzer trials: $(date)" + set +e + set -o pipefail + PA_MAX_TRIALS=${PA_MAX_TRIALS:-"50"} + $PERF_CLIENT -v \ + -p${PERF_CLIENT_STABILIZE_WINDOW} \ + -s${PERF_CLIENT_STABILIZE_THRESHOLD} \ + ${PERF_CLIENT_EXTRA_ARGS} \ + -m ${MODEL_NAME} \ + -b${STATIC_BATCH} -t${CONCURRENCY} \ + --max-trials "${PA_MAX_TRIALS}" \ + --shape ${INPUT_NAME}:${SHAPE} \ + ${SERVICE_ARGS} \ + -f ${RESULTDIR}/${NAME}.csv 2>&1 | tee ${RESULTDIR}/${NAME}.log + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** FAILED Perf Analyzer measurement\n***" + RET=1 + fi + echo "Time after perf analyzer trials: $(date)" + set +o pipefail + set -e + + echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_benchmark_name\":\"nomodel\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_server\":\"triton\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_protocol\":\"${PERF_CLIENT_PROTOCOL}\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_framework\":\"${BACKEND}\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_model\":\"${MODEL_NAME}\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"l_concurrency\":${CONCURRENCY}," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"l_dynamic_batch_size\":${DYNAMIC_BATCH}," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"l_batch_size\":${STATIC_BATCH}," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"l_size\":${TENSOR_SIZE}," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_shared_memory\":\"${SHARED_MEMORY}\"," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"l_instance_count\":${INSTANCE_CNT}," >> ${RESULTDIR}/${NAME}.tjson + echo -e "\"s_architecture\":\"${ARCH}\"}]" >> ${RESULTDIR}/${NAME}.tjson + + # SERVER_PID may not be set if using "triton_c_api" for example + if [[ -n "${SERVER_PID}" ]]; then + kill $SERVER_PID + wait $SERVER_PID + fi + + if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + $REPORTER -v -o ${RESULTDIR}/${NAME}.json --csv ${RESULTDIR}/${NAME}.csv ${URL_FLAG} ${RESULTDIR}/${NAME}.tjson + if [ $? -ne 0 ]; then + RET=1 + fi + + set -e + fi + done + done + done +done + +if [ $RET == 0 ]; then + echo -e "\n***\n*** Test ${RESULTNAME} Passed\n***" +else + echo -e "\n***\n*** Test ${RESULTNAME} FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_nomodel/test.sh b/qa/L0_perf_nomodel/test.sh new file mode 100755 index 0000000000..6ff68303ed --- /dev/null +++ b/qa/L0_perf_nomodel/test.sh @@ -0,0 +1,225 @@ +#!/bin/bash +# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +rm -f *.log *.csv *.tjson *.json + +# Descriptive name for the current results +UNDERTEST_NAME=${NVIDIA_TRITON_SERVER_VERSION} + +# Confidence percentile to use when stabilizing and reporting +# results. A value of 0 indicates that the average value should be used +# for stabilizing results. +PERF_CLIENT_PERCENTILE=${PERF_CLIENT_PERCENTILE:=95} + +# Threshold, as a percentage, to mark any performance change as a +# speedup or a slowdown. +PERF_CLIENT_SPEEDUP_THRESHOLD=5.0 +PERF_CLIENT_SLOWDOWN_THRESHOLD=5.0 + +# Length of window, in milliseconds, to use when stabilizing latency +# and infer/sec results. +PERF_CLIENT_STABILIZE_WINDOW=10000 + +# Threshold, as a percentage, to use when stabilizing latency and +# infer/sec results. Values must vary by less than this percent over 3 +# measurement windows to be considered valid. +PERF_CLIENT_STABILIZE_THRESHOLD=15.0 + +RUNTEST=./run_test.sh + +# The model used for data collection has a single input and a single +# output. The model does minimal work (it just copies input to +# output). TENSOR_SIZE is the number of elements in the model input +# and the model output. The tensor element type is float, so to get +# the number of elements in each tensor, divide the test I/O size +# by 4.
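+# For example, for the 16MB payload cases defined below: 16 * 1024 * 1024 +# bytes of I/O divided by 4 bytes per float element gives +# TENSOR_SIZE_16MB = 4 * 1024 * 1024 elements.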
+TENSOR_SIZE_16MB=$((4*1024*1024)) + +if [ "$BENCHMARK_TEST_SHARED_MEMORY" == "system" ]; then + UNDERTEST_NAME="$UNDERTEST_NAME System Shared Memory"; + SUFFIX="_shm" +elif [ "$BENCHMARK_TEST_SHARED_MEMORY" == "cuda" ]; then + UNDERTEST_NAME="$UNDERTEST_NAME CUDA Shared Memory"; + SUFFIX="_cudashm" +else + BENCHMARK_TEST_SHARED_MEMORY="none" + TEST_NAMES=( + "${UNDERTEST_NAME} Minimum Latency GRPC" + "${UNDERTEST_NAME} Minimum Latency HTTP" + "${UNDERTEST_NAME} Minimum Latency C API" + "${UNDERTEST_NAME} Maximum Throughput GRPC" + "${UNDERTEST_NAME} Maximum Throughput HTTP" + "${UNDERTEST_NAME} Maximum Throughput C API") + TEST_DIRS=( + min_latency_grpc + min_latency_http + min_latency_triton_c_api + max_throughput_grpc + max_throughput_http + max_throughput_triton_c_api) + SUFFIX="" + TEST_CONCURRENCY=( + 1 + 1 + 1 + 16 + 16 + 16) + TEST_INSTANCE_COUNTS=( + 1 + 1 + 1 + 2 + 2 + 2) + # Small payloads + TEST_TENSOR_SIZES=( + 1 + 1 + 1 + 1 + 1 + 1) + TEST_PROTOCOLS=( + grpc + http + triton_c_api + grpc + http + triton_c_api) +fi +TEST_NAMES+=( + "${UNDERTEST_NAME} 16MB I/O Latency GRPC" + "${UNDERTEST_NAME} 16MB I/O Latency HTTP" + "${UNDERTEST_NAME} 16MB I/O Latency C API" + "${UNDERTEST_NAME} 16MB I/O Throughput GRPC" + "${UNDERTEST_NAME} 16MB I/O Throughput HTTP" + "${UNDERTEST_NAME} 16MB I/O Throughput C API") +TEST_DIRS+=( + 16mb_latency_grpc${SUFFIX} + 16mb_latency_http${SUFFIX} + 16mb_latency_triton_c_api${SUFFIX} + 16mb_throughput_grpc${SUFFIX} + 16mb_throughput_http${SUFFIX} + 16mb_throughput_triton_c_api${SUFFIX}) +TEST_PROTOCOLS+=( + grpc + http + triton_c_api + grpc + http + triton_c_api) +# Large payloads +TEST_TENSOR_SIZES+=( + ${TENSOR_SIZE_16MB} + ${TENSOR_SIZE_16MB} + ${TENSOR_SIZE_16MB} + ${TENSOR_SIZE_16MB} + ${TENSOR_SIZE_16MB} + ${TENSOR_SIZE_16MB}) +TEST_INSTANCE_COUNTS+=( + 1 + 1 + 1 + 2 + 2 + 2) +TEST_CONCURRENCY+=( + 1 + 1 + 1 + 16 + 16 + 16) +TEST_BACKENDS=${BACKENDS:="plan custom graphdef savedmodel onnx libtorch python"} + +mkdir -p ${REPO_VERSION} + +# +# Run Performance tests +# + +RET=0 +set +e + +for idx in "${!TEST_NAMES[@]}"; do + TEST_NAME=${TEST_NAMES[$idx]} + TEST_DIR=${TEST_DIRS[$idx]} + TEST_PROTOCOL=${TEST_PROTOCOLS[$idx]} + TEST_TENSOR_SIZE=${TEST_TENSOR_SIZES[$idx]} + TEST_INSTANCE_COUNT=${TEST_INSTANCE_COUNTS[$idx]} + TEST_CONCURRENCY=${TEST_CONCURRENCY[$idx]} + + # FIXME: If PA C API adds SHMEM support, remove this. + if [[ "${BENCHMARK_TEST_SHARED_MEMORY}" != "none" ]] && \ + [[ "${TEST_PROTOCOL}" == "triton_c_api" ]]; then + echo "WARNING: Perf Analyzer does not support shared memory I/O when benchmarking directly with Triton C API, skipping." + continue + fi + + RESULTNAME=${TEST_NAME} \ + RESULTDIR=${REPO_VERSION}/${TEST_DIR} \ + PERF_CLIENT_PERCENTILE=${PERF_CLIENT_PERCENTILE} \ + PERF_CLIENT_STABILIZE_WINDOW=${PERF_CLIENT_STABILIZE_WINDOW} \ + PERF_CLIENT_STABILIZE_THRESHOLD=${PERF_CLIENT_STABILIZE_THRESHOLD} \ + PERF_CLIENT_PROTOCOL=${TEST_PROTOCOL} \ + TENSOR_SIZE=${TEST_TENSOR_SIZE} \ + BACKENDS=${TEST_BACKENDS} \ + SHARED_MEMORY=${BENCHMARK_TEST_SHARED_MEMORY} \ + STATIC_BATCH_SIZES=1 \ + DYNAMIC_BATCH_SIZES=1 \ + INSTANCE_COUNTS=${TEST_INSTANCE_COUNT} \ + CONCURRENCY=${TEST_CONCURRENCY} \ + bash -x ${RUNTEST} ${REPO_VERSION} + if (( $? 
!= 0 )); then + RET=1 + fi +done + +set -e + +if (( $RET == 0 )); then + echo -e "\n***\n*** Data Collection Passed\n***" +else + echo -e "\n***\n*** Data Collection FAILED\n***" + exit $RET +fi + +exit $RET diff --git a/qa/L0_perf_pyclients/custom_models/custom_zero_1_int32/config.pbtxt b/qa/L0_perf_pyclients/custom_models/custom_zero_1_int32/config.pbtxt new file mode 100644 index 0000000000..c5d18c442d --- /dev/null +++ b/qa/L0_perf_pyclients/custom_models/custom_zero_1_int32/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_zero_1_int32" +backend: "identity" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +instance_group { + count: 1 + kind:KIND_CPU +} diff --git a/qa/L0_perf_pyclients/simple_perf_client.py b/qa/L0_perf_pyclients/simple_perf_client.py new file mode 100755 index 0000000000..fd02f94887 --- /dev/null +++ b/qa/L0_perf_pyclients/simple_perf_client.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys +import time + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException, triton_to_np_dtype + +FLAGS = None + + +def parse_model_grpc(model_metadata, model_config): + """ + Check the configuration of a model to make sure it is supported + by this client. + """ + if len(model_metadata.inputs) != 1: + raise Exception("expecting 1 input, got {}".format(len(model_metadata.inputs))) + if len(model_metadata.outputs) != 1: + raise Exception( + "expecting 1 output, got {}".format(len(model_metadata.outputs)) + ) + + if len(model_config.input) != 1: + raise Exception( + "expecting 1 input in model configuration, got {}".format( + len(model_config.input) + ) + ) + + input_metadata = model_metadata.inputs[0] + output_metadata = model_metadata.outputs[0] + + batch_dim = model_config.max_batch_size > 0 + expected_dims = 1 + (1 if batch_dim else 0) + + if len(input_metadata.shape) != expected_dims: + raise Exception( + "expecting input to have {} dimensions, model '{}' input has {}".format( + expected_dims, model_metadata.name, len(input_metadata.shape) + ) + ) + + if len(output_metadata.shape) != expected_dims: + raise Exception( + "expecting output to have {} dimensions, model '{}' output has {}".format( + expected_dims, model_metadata.name, len(output_metadata.shape) + ) + ) + + if input_metadata.shape[-1] != -1: + raise Exception( + "expecting input to have variable shape [-1], model '{}' input has {}".format( + model_metadata.name, input_metadata.shape + ) + ) + + if output_metadata.shape[-1] != -1: + raise Exception( + "expecting output to have variable shape [-1], model '{}' output has {}".format( + model_metadata.name, output_metadata.shape + ) + ) + + return ( + model_config.max_batch_size, + input_metadata.name, + output_metadata.name, + input_metadata.datatype, + ) + + +def parse_model_http(model_metadata, model_config): + """ + Check the configuration of a model to make sure it is supported + by this client. 
+ """ + if len(model_metadata["inputs"]) != 1: + raise Exception( + "expecting 1 input, got {}".format(len(model_metadata["inputs"])) + ) + if len(model_metadata["outputs"]) != 1: + raise Exception( + "expecting 1 output, got {}".format(len(model_metadata["outputs"])) + ) + + if len(model_config["input"]) != 1: + raise Exception( + "expecting 1 input in model configuration, got {}".format( + len(model_config["input"]) + ) + ) + + input_metadata = model_metadata["inputs"][0] + output_metadata = model_metadata["outputs"][0] + + max_batch_size = 0 + if "max_batch_size" in model_config: + max_batch_size = model_config["max_batch_size"] + + batch_dim = max_batch_size > 0 + expected_dims = 1 + (1 if batch_dim else 0) + + if len(input_metadata["shape"]) != expected_dims: + raise Exception( + "expecting input to have {} dimensions, model '{}' input has {}".format( + expected_dims, model_metadata.name, len(input_metadata["shape"]) + ) + ) + + if len(output_metadata["shape"]) != expected_dims: + raise Exception( + "expecting output to have {} dimensions, model '{}' output has {}".format( + expected_dims, model_metadata.name, len(output_metadata["shape"]) + ) + ) + + if input_metadata["shape"][-1] != -1: + raise Exception( + "expecting input to have variable shape [-1], model '{}' input has {}".format( + model_metadata.name, input_metadata["shape"] + ) + ) + + if output_metadata["shape"][-1] != -1: + raise Exception( + "expecting output to have variable shape [-1], model '{}' output has {}".format( + model_metadata.name, output_metadata["shape"] + ) + ) + + return ( + max_batch_size, + input_metadata["name"], + output_metadata["name"], + input_metadata["datatype"], + ) + + +def requestGenerator(input_name, input_data, output_name, dtype, protocol): + # Set the input data + inputs = [] + if protocol.lower() == "grpc": + inputs.append(grpcclient.InferInput(input_name, input_data.shape, dtype)) + inputs[0].set_data_from_numpy(input_data) + else: + inputs.append(httpclient.InferInput(input_name, input_data.shape, dtype)) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + + outputs = [] + if protocol.lower() == "grpc": + outputs.append(grpcclient.InferRequestedOutput(output_name)) + else: + outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True)) + + return inputs, outputs + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-m", "--model-name", type=str, required=True, help="Name of model" + ) + parser.add_argument( + "-x", + "--model-version", + type=str, + required=False, + default="", + help="Version of model. Default is to use latest version.", + ) + parser.add_argument( + "-b", + "--batch-size", + type=int, + required=False, + default=1, + help="Batch size. Default is 1.", + ) + parser.add_argument( + "-s", + "--shape", + type=int, + required=False, + default=1, + help="The shape of the tensor. Default is 1.", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="HTTP", + help="Protocol (HTTP/gRPC) used to communicate with " + + "the inference service. 
Default is HTTP.", + ) + parser.add_argument( + "-c", + "--iteration_count", + type=int, + required=False, + default=1000, + help="The number of iterations. Default is 1000.", + ) + parser.add_argument( + "-w", + "--warmup_count", + type=int, + required=False, + default=500, + help="The number of warm-up iterations. Default is 500.", + ) + parser.add_argument( + "--csv", + type=str, + required=False, + default=None, + help="The name of the file to store the results in CSV format", + ) + FLAGS = parser.parse_args() + + try: + if FLAGS.protocol.lower() == "grpc": + # Create gRPC client for communicating with the server + triton_client = grpcclient.InferenceServerClient( + url=FLAGS.url, verbose=FLAGS.verbose + ) + else: + triton_client = httpclient.InferenceServerClient( + url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1 + ) + except Exception as e: + print("client creation failed: " + str(e)) + sys.exit(1) + + # Make sure the model matches our requirements, and get some + # properties of the model that we need for preprocessing + try: + model_metadata = triton_client.get_model_metadata( + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) + except InferenceServerException as e: + print("failed to retrieve the metadata: " + str(e)) + sys.exit(1) + + # Make sure the model matches our requirements, and get some + # properties of the model that we need for preprocessing + try: + model_metadata = triton_client.get_model_metadata( + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) + except InferenceServerException as e: + print("failed to retrieve the metadata: " + str(e)) + sys.exit(1) + + try: + model_config = triton_client.get_model_config( + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) + except InferenceServerException as e: + print("failed to retrieve the config: " + str(e)) + sys.exit(1) + + if FLAGS.protocol.lower() == "grpc": + max_batch_size, input_name, output_name, dtype = parse_model_grpc( + model_metadata, model_config.config + ) + else: + max_batch_size, input_name, output_name, dtype = parse_model_http( + model_metadata, model_config + ) + + input_data = np.zeros( + [FLAGS.batch_size, FLAGS.shape], dtype=triton_to_np_dtype(dtype) + ) + + # --------------------------- Warm-Up -------------------------------------------------------- + for i in range(FLAGS.warmup_count): + inputs, outputs = requestGenerator( + input_name, input_data, output_name, dtype, FLAGS.protocol.lower() + ) + triton_client.infer( + FLAGS.model_name, inputs, model_version=FLAGS.model_version, outputs=outputs + ) + + latencies = [] + + # --------------------------- Start Load -------------------------------------------------------- + + start_time = time.time() + + for i in range(FLAGS.iteration_count): + t0 = time.time() + inputs, outputs = requestGenerator( + input_name, input_data, output_name, dtype, FLAGS.protocol.lower() + ) + triton_client.infer( + FLAGS.model_name, inputs, model_version=FLAGS.model_version, outputs=outputs + ) + latencies.append(time.time() - t0) + + end_time = time.time() + + throughput = FLAGS.iteration_count / (end_time - start_time) + average_latency = np.average(latencies) * 1000 + p50_latency = np.percentile(latencies, 50) * 1000 + p90_latency = np.percentile(latencies, 90) * 1000 + p95_latency = np.percentile(latencies, 95) * 1000 + p99_latency = np.percentile(latencies, 99) * 1000 + + # --------------------------- Print Report ----------------------------------------------------- + print("Throughput: {} 
infer/sec".format(throughput)) + print("Latencies:") + print("\tAvg: {} ms".format(average_latency)) + print("\tp50: {} ms".format(p50_latency)) + print("\tp90: {} ms".format(p90_latency)) + print("\tp95: {} ms".format(p95_latency)) + print("\tp99: {} ms".format(p99_latency)) + + # --------------------------- Write CSV -------------------------------------------------------- + if FLAGS.csv != None: + file = open(FLAGS.csv, "w") + file.write( + "Concurrency,Inferences/Second,p50 latency,p90 latency,p95 latency,p99 latency\n" + ) + file.write( + "1,{},{},{},{},{}".format( + throughput, + p50_latency * 1000, + p90_latency * 1000, + p95_latency * 1000, + p99_latency * 1000, + ) + ) + file.close() diff --git a/qa/L0_perf_pyclients/test.sh b/qa/L0_perf_pyclients/test.sh new file mode 100755 index 0000000000..9b7e405977 --- /dev/null +++ b/qa/L0_perf_pyclients/test.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +REPORTER=../common/reporter.py +CLIENT_LOG="./simple_perf_client.log" +SIMPLE_PERF_CLIENT=simple_perf_client.py + +TF_VERSION=${TF_VERSION:=2} + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/custom_models --backend-config=tensorflow,version=${TF_VERSION}" +source ../common/util.sh + +# Select the single GPU that will be available to the inference +# server. 
+export CUDA_VISIBLE_DEVICES=0 +PROTOCOLS="grpc http" + +rm -f *.log *.csv *.tjson *.json + +RET=0 + +MODEL_NAME="custom_zero_1_int32" + +for PROTOCOL in $PROTOCOLS; do + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + + NAME=${MODEL_NAME}_${PROTOCOL} + EXTRA_ARGS="" && [[ "${PROTOCOL}" == "grpc" ]] && EXTRA_ARGS="-i grpc -u localhost:8001" + python $SIMPLE_PERF_CLIENT -m $MODEL_NAME --shape 100000 --csv ${NAME}.csv ${EXTRA_ARGS}>> ${NAME}.log 2>&1 + if (( $? != 0 )); then + RET=1 + fi + + echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >> ${NAME}.tjson + echo -e "\"s_benchmark_name\":\"python_client\"," >> ${NAME}.tjson + echo -e "\"s_server\":\"triton\"," >> ${NAME}.tjson + echo -e "\"s_protocol\":\"${PROTOCOL}\"," >> ${NAME}.tjson + echo -e "\"s_framework\":\"custom\"," >> ${NAME}.tjson + echo -e "\"s_model\":\"${MODEL_NAME}\"," >> ${NAME}.tjson + echo -e "\"l_concurrency\":1," >> ${NAME}.tjson + echo -e "\"l_batch_size\":1," >> ${NAME}.tjson + echo -e "\"l_instance_count\":1}]" >> ${NAME}.tjson + + + if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + python $REPORTER -v -o ${NAME}.json --csv ${NAME}.csv ${URL_FLAG} ${NAME}.tjson + if (( $? != 0 )); then + RET=1 + fi + + set -e + fi + + kill $SERVER_PID + wait $SERVER_PID +done + +if (( $RET == 0 )); then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_resnet/run_test.sh b/qa/L0_perf_resnet/run_test.sh new file mode 100755 index 0000000000..579d00c0e5 --- /dev/null +++ b/qa/L0_perf_resnet/run_test.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
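+# This script is driven entirely by environment variables and is not
+# normally run by hand. As a rough sketch (mirroring how test.sh invokes
+# it below), an invocation for the TensorRT ResNet50 model looks like:
+#
+#   MODEL_NAME=resnet50_fp32_plan MODEL_FRAMEWORK=plan \
+#   MODEL_PATH=`pwd`/tensorrt_models/resnet50_fp32_plan \
+#   STATIC_BATCH=1 INSTANCE_CNT=1 CONCURRENCY=1 \
+#   PERF_CLIENT_PROTOCOL=grpc ARCH=x86_64 bash -x run_test.sh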
+ +STATIC_BATCH=${STATIC_BATCH:=1} +INSTANCE_CNT=${INSTANCE_CNT:=1} +BACKEND_CONFIG=${BACKEND_CONFIG:=""} +TF_VERSION=${TF_VERSION:=2} + +REPORTER=../common/reporter.py + +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +MODEL_REPO="${PWD}/models" +SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} ${BACKEND_CONFIG} --backend-config=tensorflow,version=${TF_VERSION}" +source ../common/util.sh + +# Select the single GPU that will be available to the inference +# server. Or use "export CUDA_VISIBLE_DEVICE=" to run on CPU. +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +MAX_BATCH=${STATIC_BATCH} +NAME=${MODEL_NAME}_sbatch${STATIC_BATCH}_instance${INSTANCE_CNT}_${PERF_CLIENT_PROTOCOL} + +rm -fr models && mkdir -p models && \ + cp -r $MODEL_PATH models/. && \ + (cd models/$MODEL_NAME && \ + sed -i "s/^max_batch_size:.*/max_batch_size: ${MAX_BATCH}/" config.pbtxt && \ + echo "instance_group [ { count: ${INSTANCE_CNT} }]") + +MEASUREMENT_WINDOW=5000 +PERF_CLIENT=../clients/perf_client +# Onnx and onnx-trt models are very slow on Jetson. +if [ "$ARCH" == "aarch64" ]; then + if [ "$MODEL_FRAMEWORK" == "onnx" ] || [ "$MODEL_FRAMEWORK" == "onnx_trt" ]; then + MEASUREMENT_WINDOW=20000 + fi +fi + +# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and +# reporting structure, though "triton_c_api" is not strictly a "protocol". +if [[ "${PERF_CLIENT_PROTOCOL}" == "triton_c_api" ]]; then + # Server will be run in-process with C API + SERVICE_ARGS="--service-kind triton_c_api \ + --triton-server-directory ${TRITON_DIR} \ + --model-repository ${MODEL_REPO}" +else + SERVICE_ARGS="-i ${PERF_CLIENT_PROTOCOL}" + + SERVER_LOG="${NAME}.server.log" + run_server + if (( $SERVER_PID == 0 )); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # Run the model once to warm up. Some frameworks do optimization on the first requests. + # Must warmup similar to actual run so that all instances are ready + # Note: Running extra PA for warmup doesn't make sense for C API since it + # uses in-process tritonserver which will exit along with this PA process. + set +e + $PERF_CLIENT -v -m $MODEL_NAME -p${MEASUREMENT_WINDOW} \ + -b${STATIC_BATCH} --concurrency-range ${CONCURRENCY} \ + ${SERVICE_ARGS} + set -e +fi + +set +e +set -o pipefail +PA_MAX_TRIALS=${PA_MAX_TRIALS:-"50"} +# Measure perf client results and write them to a file for reporting +$PERF_CLIENT -v -m $MODEL_NAME -p${MEASUREMENT_WINDOW} \ + -b${STATIC_BATCH} --concurrency-range ${CONCURRENCY} \ + --max-trials "${PA_MAX_TRIALS}" \ + ${SERVICE_ARGS} \ + -f ${NAME}.csv 2>&1 | tee ${NAME}.log +if (( $? 
!= 0 )); then + echo -e "\n***\n*** FAILED Perf Analyzer measurement\n***" + RET=1 +fi +set +o pipefail +set -e + +echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >> ${NAME}.tjson +echo -e "\"s_benchmark_name\":\"resnet50\"," >> ${NAME}.tjson +echo -e "\"s_server\":\"triton\"," >> ${NAME}.tjson +echo -e "\"s_protocol\":\"${PERF_CLIENT_PROTOCOL}\"," >> ${NAME}.tjson +echo -e "\"s_framework\":\"${MODEL_FRAMEWORK}\"," >> ${NAME}.tjson +echo -e "\"s_model\":\"${MODEL_NAME}\"," >> ${NAME}.tjson +echo -e "\"l_concurrency\":${CONCURRENCY}," >> ${NAME}.tjson +echo -e "\"l_batch_size\":${STATIC_BATCH}," >> ${NAME}.tjson +echo -e "\"l_instance_count\":${INSTANCE_CNT}," >> ${NAME}.tjson +echo -e "\"s_architecture\":\"${ARCH}\"}]" >> ${NAME}.tjson + +# SERVER_PID may not be set if using "triton_c_api" for example +if [[ -n "${SERVER_PID}" ]]; then + kill $SERVER_PID + wait $SERVER_PID +fi + +if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + $REPORTER -v -o ${NAME}.json --csv ${NAME}.csv ${URL_FLAG} ${NAME}.tjson + if (( $? != 0 )); then + RET=1 + fi + + set -e +fi + +if (( $RET == 0 )); then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_resnet/test.sh b/qa/L0_perf_resnet/test.sh new file mode 100755 index 0000000000..35d5b174be --- /dev/null +++ b/qa/L0_perf_resnet/test.sh @@ -0,0 +1,239 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +rm -f *.log *.csv *.tjson *.json + +PROTOCOLS="grpc http triton_c_api" + +TRT_MODEL_NAME="resnet50_fp32_plan" +TF_MODEL_NAME="resnet50v1.5_fp16_savedmodel" +PYT_MODEL_NAME="resnet50_fp32_libtorch" +ONNX_MODEL_NAME="resnet50_fp32_onnx" + +# The base model name should be the prefix to the +# respective optimized model name. +TFTRT_MODEL_NAME="resnet50v1.5_fp16_savedmodel_trt" +ONNXTRT_MODEL_NAME="resnet50_fp32_onnx_trt" +TFAMP_MODEL_NAME="resnet50v1.5_fp16_savedmodel_amp" + +ARCH=${ARCH:="x86_64"} +REPODIR=${REPODIR:="/data/inferenceserver/${REPO_VERSION}"} +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +TRTEXEC=/usr/src/tensorrt/bin/trtexec +CACHE_PATH=`pwd`/trt_cache + + +# +# Test minimum latency +# +STATIC_BATCH=1 +INSTANCE_CNT=1 +CONCURRENCY=1 + +MODEL_NAMES="${TRT_MODEL_NAME} ${TF_MODEL_NAME} ${ONNX_MODEL_NAME} ${PYT_MODEL_NAME}" + +# Disable TF-TRT test on Jetson due to Segfault +# Disable ORT-TRT test on Jetson due to support being disabled +if [ "$ARCH" == "aarch64" ]; then + OPTIMIZED_MODEL_NAMES="${TFAMP_MODEL_NAME}" +else + OPTIMIZED_MODEL_NAMES="${TFTRT_MODEL_NAME} ${TFAMP_MODEL_NAME} ${ONNXTRT_MODEL_NAME}" +fi + +# Create optimized models +rm -fr optimized_model_store && mkdir optimized_model_store +for MODEL_NAME in $OPTIMIZED_MODEL_NAMES; do + BASE_MODEL=$(echo ${MODEL_NAME} | cut -d '_' -f 1,2,3) + cp -r $REPODIR/perf_model_store/${BASE_MODEL} optimized_model_store/${MODEL_NAME} + CONFIG_PATH="optimized_model_store/${MODEL_NAME}/config.pbtxt" + sed -i "s/^name: \"${BASE_MODEL}\"/name: \"${MODEL_NAME}\"/" ${CONFIG_PATH} + echo "optimization { execution_accelerators {" >> ${CONFIG_PATH} + echo "gpu_execution_accelerator : [ {" >> ${CONFIG_PATH} + if [ "${MODEL_NAME}" = "${TFAMP_MODEL_NAME}" ] ; then + echo "name : \"auto_mixed_precision\" " >> ${CONFIG_PATH} + else + echo "name : \"tensorrt\" " >> ${CONFIG_PATH} + if [ "${MODEL_NAME}" = "${TFTRT_MODEL_NAME}" ] ; then + echo "parameters { key: \"precision_mode\" value: \"FP16\" }" >> ${CONFIG_PATH} + fi + + if [ "${MODEL_NAME}" = "${ONNXTRT_MODEL_NAME}" ] ; then + echo "parameters { key: \"precision_mode\" value: \"FP16\" }" >> ${CONFIG_PATH} + echo "parameters { key: \"max_workspace_size_bytes\" value: \"1073741824\" }" >> ${CONFIG_PATH} + echo "parameters { key: \"trt_engine_cache_enable\" value: \"1\" }" >> ${CONFIG_PATH} + echo "parameters { key: \"trt_engine_cache_path\" value: \"${CACHE_PATH}\" } " >> ${CONFIG_PATH} + fi + fi + echo "} ]" >> ${CONFIG_PATH} + echo "}}" >> ${CONFIG_PATH} +done + +# Create the TensorRT plan from ONNX model +rm -fr tensorrt_models && mkdir -p tensorrt_models/$TRT_MODEL_NAME/1 && \ +cp $REPODIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/1/model.onnx tensorrt_models/$TRT_MODEL_NAME/ && \ +cp $REPODIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/labels.txt tensorrt_models/$TRT_MODEL_NAME/ && \ +cp $REPODIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/config.pbtxt tensorrt_models/$TRT_MODEL_NAME/ + +# Build TRT engine +$TRTEXEC --onnx=tensorrt_models/$TRT_MODEL_NAME/model.onnx --saveEngine=tensorrt_models/$TRT_MODEL_NAME/1/model.plan \ + --minShapes=input:1x3x224x224 --optShapes=input:${STATIC_BATCH}x3x224x224 \ + --maxShapes=input:${STATIC_BATCH}x3x224x224 + +rm tensorrt_models/$TRT_MODEL_NAME/model.onnx +sed -i "s/^name: .*/name: \"$TRT_MODEL_NAME\"/g" tensorrt_models/$TRT_MODEL_NAME/config.pbtxt && \ +sed -i 's/^platform: .*/platform: "tensorrt_plan"/g' 
tensorrt_models/$TRT_MODEL_NAME/config.pbtxt + +# Tests with each "non-optimized" model +for MODEL_NAME in $MODEL_NAMES; do + for PROTOCOL in $PROTOCOLS; do + REPO=`pwd`/tensorrt_models && [ "$MODEL_NAME" != "$TRT_MODEL_NAME" ] && \ + REPO=$REPODIR/perf_model_store + FRAMEWORK=$(echo ${MODEL_NAME} | cut -d '_' -f 3) + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH=${STATIC_BATCH} \ + PERF_CLIENT_PROTOCOL=${PROTOCOL} \ + INSTANCE_CNT=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + ARCH=${ARCH} \ + bash -x run_test.sh + done +done + +# Tests with optimization enabled models +for MODEL_NAME in $OPTIMIZED_MODEL_NAMES; do + for PROTOCOL in $PROTOCOLS; do + REPO=`pwd`/optimized_model_store + FRAMEWORK=$(echo ${MODEL_NAME} | cut -d '_' -f 3,4) + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH=${STATIC_BATCH} \ + PERF_CLIENT_PROTOCOL=${PROTOCOL} \ + INSTANCE_CNT=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + ARCH=${ARCH} \ + bash -x run_test.sh + done +done + +# +# Test large static batch = 128 w/ 2 instances (Use batch size 64 on Jetson Xavier) +# +if [ "$ARCH" == "aarch64" ]; then + STATIC_BATCH=64 +else + STATIC_BATCH=128 +fi + +INSTANCE_CNT=2 +CONCURRENCY=4 + +# Create the TensorRT plan from ONNX model +rm -fr tensorrt_models && mkdir -p tensorrt_models/$TRT_MODEL_NAME/1 && \ +cp $REPODIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/1/model.onnx tensorrt_models/$TRT_MODEL_NAME/ && \ +cp $REPODIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/labels.txt tensorrt_models/$TRT_MODEL_NAME/ && \ +cp $REPODIR/qa_dynamic_batch_image_model_repository/resnet50_onnx/config.pbtxt tensorrt_models/$TRT_MODEL_NAME/ + +# Build TRT engine +$TRTEXEC --onnx=tensorrt_models/$TRT_MODEL_NAME/model.onnx --saveEngine=tensorrt_models/$TRT_MODEL_NAME/1/model.plan \ + --minShapes=input:1x3x224x224 --optShapes=input:${STATIC_BATCH}x3x224x224 \ + --maxShapes=input:${STATIC_BATCH}x3x224x224 + +rm tensorrt_models/$TRT_MODEL_NAME/model.onnx +sed -i "s/^name: .*/name: \"$TRT_MODEL_NAME\"/g" tensorrt_models/$TRT_MODEL_NAME/config.pbtxt && \ +sed -i 's/^platform: .*/platform: "tensorrt_plan"/g' tensorrt_models/$TRT_MODEL_NAME/config.pbtxt + +for MODEL_NAME in $MODEL_NAMES; do + for PROTOCOL in $PROTOCOLS; do + REPO=`pwd`/tensorrt_models && [ "$MODEL_NAME" != "$TRT_MODEL_NAME" ] && \ + REPO=$REPODIR/perf_model_store + FRAMEWORK=$(echo ${MODEL_NAME} | cut -d '_' -f 3) + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH=${STATIC_BATCH} \ + PERF_CLIENT_PROTOCOL=${PROTOCOL} \ + INSTANCE_CNT=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + ARCH=${ARCH} \ + bash -x run_test.sh + done +done + +for MODEL_NAME in $OPTIMIZED_MODEL_NAMES; do + for PROTOCOL in $PROTOCOLS; do + REPO=`pwd`/optimized_model_store + FRAMEWORK=$(echo ${MODEL_NAME} | cut -d '_' -f 3,4) + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH=${STATIC_BATCH} \ + PERF_CLIENT_PROTOCOL=${PROTOCOL} \ + INSTANCE_CNT=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + ARCH=${ARCH} \ + bash -x run_test.sh + done +done + +# FIXME Disable the following due to +# https://jirasw.nvidia.com/browse/DLIS-2933. +# +# Needs this additional test configuration for comparing against TFS. 
+if [ "$ARCH" == "x86_64" ]; then + MODEL_NAME=${TF_MODEL_NAME} + REPO=$REPODIR/perf_model_store + STATIC_BATCH=128 + INSTANCE_CNT=1 + CONCURRENCY=1 + FRAMEWORK=$(echo ${MODEL_NAME} | cut -d '_' -f 3) + MODEL_NAME=${MODEL_NAME} \ + MODEL_FRAMEWORK=${FRAMEWORK} \ + MODEL_PATH="$REPO/${MODEL_NAME}" \ + STATIC_BATCH=${STATIC_BATCH} \ + PERF_CLIENT_PROTOCOL="grpc" \ + INSTANCE_CNT=${INSTANCE_CNT} \ + CONCURRENCY=${CONCURRENCY} \ + ARCH=${ARCH} \ + BACKEND_CONFIG=" --backend-config=tensorflow,version=2" \ + bash -x run_test.sh +fi diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh new file mode 100755 index 0000000000..e74b01e568 --- /dev/null +++ b/qa/L0_perf_tensorrt_llm/test.sh @@ -0,0 +1,300 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +RET=0 +BASE_DIR=$(pwd) +NUM_GPUS=${NUM_GPUS:=1} +TENSORRTLLM_BACKEND_REPO_TAG=${TENSORRTLLM_BACKEND_REPO_TAG:="main"} +TRT_ROOT="/usr/local/tensorrt" + +MODEL_NAME="gpt2_tensorrt_llm" +NAME="tensorrt_llm_benchmarking_test" +MODEL_REPOSITORY="$(pwd)/triton_model_repo" +TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend" +GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt" +TOKENIZER_DIR="$GPT_DIR/gpt2" +ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu" +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +SERVER_LOG="${NAME}_server.log" +SERVER_TIMEOUT=${SERVER_TIMEOUT:=120} + +function clone_tensorrt_llm_backend_repo { + rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR + apt-get update && apt-get install git-lfs -y --no-install-recommends + git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR + cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive +} + +# Update Open MPI to a version compatible with SLURM. 
+function upgrade_openmpi { + local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}') + + if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then + # Uninstall the current version of Open MPI + rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || { + echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION." + exit 1 + } + else + echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade." + return + fi + + # Install SLURM supported Open MPI version + cd /tmp/ + wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || { + echo "Failed to download Open MPI 5.0.1" + exit 1 + } + rm -rf openmpi-5.0.1 && tar -xzf openmpi-5.0.1.tar.gz && cd openmpi-5.0.1 || { + echo "Failed to extract Open MPI 5.0.1" + exit 1 + } + ./configure --prefix=/opt/hpcx/ompi/ && make && make install || { + echo "Failed to install Open MPI 5.0.1" + exit 1 + } + + # Update environment variables + if ! grep -q '/opt/hpcx/ompi/bin' ~/.bashrc; then + echo 'export PATH=/opt/hpcx/ompi/bin:$PATH' >>~/.bashrc + fi + + if ! grep -q '/opt/hpcx/ompi/lib' ~/.bashrc; then + echo 'export LD_LIBRARY_PATH=/opt/hpcx/ompi/lib:$LD_LIBRARY_PATH' >>~/.bashrc + fi + ldconfig + source ~/.bashrc + cd "$BASE_DIR" + mpirun --version +} + +function build_gpt2_base_model { + # Download weights from HuggingFace Transformers + cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2 + rm pytorch_model.bin model.safetensors + if ! wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin; then + echo "Downloading pytorch_model.bin failed." + exit 1 + fi + cd ${GPT_DIR} + + # Convert weights from HF Tranformers to FT format + python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" + cd ${BASE_DIR} +} + +function build_gpt2_tensorrt_engine { + # Build TensorRT engines + cd ${GPT_DIR} + trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \ + --gpt_attention_plugin float16 \ + --remove_input_padding enable \ + --paged_kv_cache enable \ + --gemm_plugin float16 \ + --workers "${NUM_GPUS}" \ + --output_dir "${ENGINES_DIR}" + + cd ${BASE_DIR} +} + +function replace_config_tags { + tag_to_replace="${1}" + new_value="${2}" + config_file_path="${3}" + sed -i "s|${tag_to_replace}|${new_value}|g" ${config_file_path} +} + +function prepare_model_repository { + rm -rf ${MODEL_REPOSITORY} && mkdir ${MODEL_REPOSITORY} + cp -r ${TENSORRTLLM_BACKEND_DIR}/all_models/inflight_batcher_llm/* ${MODEL_REPOSITORY} + rm -rf ${MODEL_REPOSITORY}/tensorrt_llm_bls + mv "${MODEL_REPOSITORY}/ensemble" "${MODEL_REPOSITORY}/${MODEL_NAME}" + + replace_config_tags "model_version: -1" "model_version: 1" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + replace_config_tags 'name: "ensemble"' "name: \"$MODEL_NAME\"" "${MODEL_REPOSITORY}/${MODEL_NAME}/config.pbtxt" + + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt" + + replace_config_tags '${triton_max_batch_size}' "128" 
"${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt" + + replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" + replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt" +} + +# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on +# success, 1 on failure +function wait_for_server_ready() { + local wait_time_secs="${1:-30}" + shift + local spids=("$@") + + WAIT_RET=0 + + for _ in $(seq "$wait_time_secs"); do + for pid in "${spids[@]}"; do + if ! kill -0 "$pid" >/dev/null 2>&1; then + echo "=== Server not running." + WAIT_RET=1 + return + fi + done + + sleep 1 + + if curl -s --fail localhost:8000/v2/health/ready && + curl -s --fail -w "%{http_code}" -o /dev/null -d '{"log_verbose_level":1}' localhost:8000/v2/logging; then + return + fi + done + + echo "=== Timeout $wait_time_secs secs. Server not ready." + WAIT_RET=1 +} + +function run_server { + python3 ${TENSORRTLLM_BACKEND_DIR}/scripts/launch_triton_server.py --world_size="${NUM_GPUS}" --model_repo="${MODEL_REPOSITORY}" >${SERVER_LOG} 2>&1 & + sleep 2 # allow time to obtain the pid(s) + # Read PIDs into an array, trimming whitespaces + readarray -t SERVER_PID < <(pgrep "tritonserver") + + wait_for_server_ready ${SERVER_TIMEOUT} "${SERVER_PID[@]}" + if [ "$WAIT_RET" != "0" ]; then + # Cleanup + kill "${SERVER_PID[@]}" >/dev/null 2>&1 || true + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi +} + +function kill_server { + pgrep tritonserver | xargs kill -SIGINT + for pid in "${SERVER_PID[@]}"; do + echo "Waiting for proc ${pid} to terminate..." + while kill -0 $pid >/dev/null 2>&1; do + sleep 1 + done + done +} + +upgrade_openmpi +clone_tensorrt_llm_backend_repo +build_gpt2_base_model +build_gpt2_tensorrt_engine +prepare_model_repository + +# Install perf_analyzer +pip3 install tritonclient + +ARCH="amd64" +STATIC_BATCH=1 +INSTANCE_CNT=1 +CONCURRENCY=100 +MODEL_FRAMEWORK="tensorrt-llm" +PERF_CLIENT="perf_analyzer" +REPORTER=../common/reporter.py +INPUT_DATA="./input_data.json" +PERF_CLIENT_PROTOCOL="grpc" +EXPORT_FILE=profile-export-tensorrt-llm-model.json +rm -rf *.tjson *.json *.csv *log + +echo '{ + "data": [ + { + "text_input": ["Hello, my name is"], + "stream": [true], + "max_tokens": [16], + "bad_words": [""], + "stop_words": [""] + } + ] +}' >$INPUT_DATA + +# Set stability-percentage 999 to bypass the stability check in PA. +# LLM generates a sequence of tokens that is unlikely to be within a reasonable bound to determine valid measurement in terms of latency. +# Using "count_windows" measurement mode, which automatically extends the window for collecting responses. 
+PERF_CLIENT_ARGS="-v -m $MODEL_NAME -i $PERF_CLIENT_PROTOCOL --async --streaming --input-data=$INPUT_DATA --profile-export-file=$EXPORT_FILE \ + --shape=text_input:1 --shape=max_tokens:1 --shape=bad_words:1 --shape=stop_words:1 --measurement-mode=count_windows \ + --concurrency-range=$CONCURRENCY --measurement-request-count=10 --stability-percentage=999" + +set +e +run_server + +$PERF_CLIENT $PERF_CLIENT_ARGS -f ${NAME}.csv 2>&1 | tee ${NAME}_perf_analyzer.log +set +o pipefail + +kill_server +set -e + +echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >>${NAME}.tjson +echo -e "\"s_benchmark_repo_branch\":\"${BENCHMARK_REPO_BRANCH}\"," >>${NAME}.tjson +echo -e "\"s_benchmark_name\":\"${NAME}\"," >>${NAME}.tjson +echo -e "\"s_server\":\"triton\"," >>${NAME}.tjson +echo -e "\"s_protocol\":\"${PERF_CLIENT_PROTOCOL}\"," >>${NAME}.tjson +echo -e "\"s_framework\":\"${MODEL_FRAMEWORK}\"," >>${NAME}.tjson +echo -e "\"s_model\":\"${MODEL_NAME}\"," >>${NAME}.tjson +echo -e "\"l_concurrency\":${CONCURRENCY}," >>${NAME}.tjson +echo -e "\"l_batch_size\":${STATIC_BATCH}," >>${NAME}.tjson +echo -e "\"l_instance_count\":${INSTANCE_CNT}," >>${NAME}.tjson +echo -e "\"s_architecture\":\"${ARCH}\"}]" >>${NAME}.tjson + +if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + python3 $REPORTER -v -e ${EXPORT_FILE} -o ${NAME}.json --csv ${NAME}.csv --gpu-metrics --token-latency ${URL_FLAG} ${NAME}.tjson + if (($? != 0)); then + RET=1 + fi + + set -e +fi + +if (($RET == 0)); then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_perf_vllm/test.sh b/qa/L0_perf_vllm/test.sh new file mode 100755 index 0000000000..e1ce8cf2ed --- /dev/null +++ b/qa/L0_perf_vllm/test.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
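+# This test builds a minimal single-model vLLM repository on the fly,
+# roughly of the form:
+#
+#   models/gpt2_vllm/1/model.json   (vLLM engine arguments)
+#   models/gpt2_vllm/config.pbtxt   (backend: "vllm", one KIND_MODEL instance)
+#
+# and then benchmarks it with perf_analyzer over streaming gRPC, pushing
+# the results to the benchmark reporter when one is configured.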
+ +source ../common/util.sh + +REPORTER=../common/reporter.py +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +MODEL_REPO="${PWD}/models" +NAME="vllm_benchmarking_test" +MODEL_NAME="gpt2_vllm" +INPUT_DATA="./input_data.json" +SERVER_LOG="${NAME}_server.log" +SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR} --log-verbose=1" + +export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0} +EXPORT_FILE=profile-export-vllm-model.json + +pip3 install tritonclient +rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv + +mkdir -p $MODEL_REPO/$MODEL_NAME/1 +echo '{ + "model":"gpt2", + "disable_log_requests": "true", + "gpu_memory_utilization": 0.5 +}' >$MODEL_REPO/$MODEL_NAME/1/model.json + +echo 'backend: "vllm" +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +]' >$MODEL_REPO/$MODEL_NAME/config.pbtxt + +echo '{ + "data": [ + { + "text_input": [ + "hi hi hi hi hi hi hi hi hi hi" + ], + "stream": [ + true + ], + "sampling_parameters": [ + "{\"max_tokens\": 1024, \"ignore_eos\": true}" + ] + } + ] +}' >$INPUT_DATA + +RET=0 +ARCH="amd64" +STATIC_BATCH=1 +INSTANCE_CNT=1 +CONCURRENCY=100 +MODEL_FRAMEWORK="vllm" +PERF_CLIENT_PROTOCOL="grpc" +PERF_CLIENT=perf_analyzer + +# Set stability-percentage 999 to bypass the stability check in PA. +# LLM generates a sequence of tokens that is unlikely to be within a reasonable bound to determine valid measurement in terms of latency. +# Using "count_windows" measurement mode, which automatically extends the window for collecting responses. +PERF_CLIENT_ARGS="-v -m $MODEL_NAME --concurrency-range=${CONCURRENCY} --measurement-mode=count_windows --measurement-request-count=10 \ + --input-data=$INPUT_DATA --profile-export-file=$EXPORT_FILE -i $PERF_CLIENT_PROTOCOL --async --streaming --stability-percentage=999" + +run_server +if (($SERVER_PID == 0)); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +$PERF_CLIENT $PERF_CLIENT_ARGS -f ${NAME}.csv 2>&1 | tee ${NAME}_perf_analyzer.log +set +o pipefail +set -e + +if [[ -n "${SERVER_PID}" ]]; then + kill $SERVER_PID + wait $SERVER_PID +fi + +echo -e "[{\"s_benchmark_kind\":\"benchmark_perf\"," >>${NAME}.tjson +echo -e "\"s_benchmark_repo_branch\":\"${BENCHMARK_REPO_BRANCH}\"," >>${NAME}.tjson +echo -e "\"s_benchmark_name\":\"${NAME}\"," >>${NAME}.tjson +echo -e "\"s_server\":\"triton\"," >>${NAME}.tjson +echo -e "\"s_protocol\":\"${PERF_CLIENT_PROTOCOL}\"," >>${NAME}.tjson +echo -e "\"s_framework\":\"${MODEL_FRAMEWORK}\"," >>${NAME}.tjson +echo -e "\"s_model\":\"${MODEL_NAME}\"," >>${NAME}.tjson +echo -e "\"l_concurrency\":\"${CONCURRENCY}\"," >>${NAME}.tjson +echo -e "\"l_batch_size\":${STATIC_BATCH}," >>${NAME}.tjson +echo -e "\"l_instance_count\":${INSTANCE_CNT}," >>${NAME}.tjson +echo -e "\"s_architecture\":\"${ARCH}\"}]" >>${NAME}.tjson + +if [ -f $REPORTER ]; then + set +e + + URL_FLAG= + if [ ! -z ${BENCHMARK_REPORTER_URL} ]; then + URL_FLAG="-u ${BENCHMARK_REPORTER_URL}" + fi + + python3 $REPORTER -v -e ${EXPORT_FILE} -o ${NAME}.json --csv ${NAME}.csv --gpu-metrics --token-latency ${URL_FLAG} ${NAME}.tjson + if (($? 
!= 0)); then + RET=1 + fi + + set -e +fi + +rm -rf $MODEL_REPO $INPUT_DATA + +if (($RET == 0)); then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_pinned_memory/libtorch_ensemble.pbtxt b/qa/L0_pinned_memory/libtorch_ensemble.pbtxt new file mode 100644 index 0000000000..ee500fcab4 --- /dev/null +++ b/qa/L0_pinned_memory/libtorch_ensemble.pbtxt @@ -0,0 +1,70 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +name: "libtorch_ensemble" +platform: "ensemble" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "custom_zero_1_float32" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + output_map { + key: "OUTPUT0" + value: "temp_0" + } + }, + { + model_name: "libtorch_zero_1_float32" + model_version: -1 + input_map { + key: "INPUT__0" + value: "temp_0" + } + output_map { + key: "OUTPUT__0" + value: "OUTPUT0" + } + } + ] +} \ No newline at end of file diff --git a/qa/L0_pinned_memory/test.sh b/qa/L0_pinned_memory/test.sh new file mode 100755 index 0000000000..e38fef96a2 --- /dev/null +++ b/qa/L0_pinned_memory/test.sh @@ -0,0 +1,210 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +CLIENT=../clients/perf_client +# Only use libtorch as it accepts GPU I/O and it can handle variable shape +BACKENDS=${BACKENDS:="libtorch"} + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +# Select the single GPU that will be available to the inference server +export CUDA_VISIBLE_DEVICES=0 + +rm -f *.log *.csv *.metrics +RET=0 + +rm -fr ./custom_models && mkdir ./custom_models && \ + cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. && \ + mkdir -p ./custom_models/custom_zero_1_float32/1 + +# +# Use "identity" model for all model types. +# +rm -fr models && mkdir -p models && \ + cp -r ./custom_models/custom_zero_1_float32 models/. && \ + (cd models/custom_zero_1_float32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + echo "instance_group [ { kind: KIND_CPU }]" >> config.pbtxt) + +for BACKEND in $BACKENDS; do + MODEL_NAME=${BACKEND}_zero_1_float32 + REPO_DIR=$DATADIR/qa_identity_model_repository + + cp -r $REPO_DIR/$MODEL_NAME models/. && \ + (cd models/$MODEL_NAME && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + echo "instance_group [ { kind: KIND_GPU }]" >> config.pbtxt) + + ENSEMBLE_NAME=${BACKEND}_ensemble + mkdir -p models/$ENSEMBLE_NAME/1 && \ + cp $ENSEMBLE_NAME.pbtxt models/$ENSEMBLE_NAME/config.pbtxt + + # With pinned memory + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" + SERVER_LOG="${ENSEMBLE_NAME}.pinned.server.log" + run_server + if (( $SERVER_PID == 0 )); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # Sanity check that the server allocates pinned memory for large size + set +e + $CLIENT -m${ENSEMBLE_NAME} --shape INPUT0:16777216 + if (( $? != 0 )); then + RET=1 + fi + + grep "non-pinned" ${ENSEMBLE_NAME}.pinned.server.log + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected only pinned memory is allocated\n***" + RET=1 + fi + + grep "] \"Pinned memory pool is created" ${ENSEMBLE_NAME}.pinned.server.log + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected pinned memory is allocated\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + + # Restart the server without verbose logging + SERVER_ARGS="--model-repository=`pwd`/models" + SERVER_LOG="${ENSEMBLE_NAME}.pinned.server.log" + run_server + if (( $SERVER_PID == 0 )); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # 16k 1m 2m 4m 8m 16m elements + set +e + for TENSOR_SIZE in 16384 1048576 2097152 4194304 8388608 16777216; do + $CLIENT -i grpc -u localhost:8001 -m${ENSEMBLE_NAME} \ + --shape INPUT0:${TENSOR_SIZE} \ + >> ${BACKEND}.${TENSOR_SIZE}.pinned.log 2>&1 + if (( $? != 0 )); then + RET=1 + fi + done + set -e + + kill $SERVER_PID + wait $SERVER_PID + + # Without pinned memory + SERVER_ARGS="--model-repository=`pwd`/models --pinned-memory-pool-byte-size=0 --log-verbose=1" + SERVER_LOG="${ENSEMBLE_NAME}.nonpinned.server.log" + run_server + if (( $SERVER_PID == 0 )); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # Sanity check that the server allocates non-pinned memory + set +e + $CLIENT -m${ENSEMBLE_NAME} --shape INPUT0:1 + if (( $? != 0 )); then + RET=1 + fi + + grep "] \"Pinned memory pool is created" ${ENSEMBLE_NAME}.nonpinned.server.log + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected only non-pinned memory is allocated\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + # Restart the server without verbose logging + SERVER_ARGS="--model-repository=`pwd`/models --pinned-memory-pool-byte-size=0" + SERVER_LOG="${ENSEMBLE_NAME}.nonpinned.server.log" + run_server + if (( $SERVER_PID == 0 )); then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + # 16k 1m 2m 4m 8m 16m elements + set +e + for TENSOR_SIZE in 16384 1048576 2097152 4194304 8388608 16777216; do + $CLIENT -i grpc -u localhost:8001 -m${ENSEMBLE_NAME} \ + --shape INPUT0:${TENSOR_SIZE} \ + >> ${BACKEND}.${TENSOR_SIZE}.nonpinned.log 2>&1 + if (( $? != 0 )); then + RET=1 + fi + done + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +for BACKEND in $BACKENDS; do + for TENSOR_SIZE in 16384 1048576 2097152 4194304 8388608 16777216; do + echo -e "${BACKEND} ensemble ${TENSOR_SIZE} elements\n" + echo -e "non-pinned\n" + cat ${BACKEND}.${TENSOR_SIZE}.nonpinned.log + echo -e "pinned\n" + cat ${BACKEND}.${TENSOR_SIZE}.pinned.log + done +done + +if (( $RET == 0 )); then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_priority/test.sh b/qa/L0_priority/test.sh index 407dfdb9cc..6756c93f21 100755 --- a/qa/L0_priority/test.sh +++ b/qa/L0_priority/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,10 +25,25 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -DATADIR=/data/inferenceserver +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi -SERVER=/opt/tensorrtserver/bin/trtserver -SERVER_ARGS=--model-store=`pwd`/models +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" SERVER_LOG="./inference_server.log" source ../common/util.sh @@ -69,9 +84,9 @@ if [ $? -ne 0 ]; then RET=1 fi -grep "plan_float32_float32_float32_max" $SERVER_LOG | grep "stream priority -1" +grep "plan_float32_float32_float32_max" $SERVER_LOG | grep "stream priority -5" if [ $? -ne 0 ]; then - echo -e "\n***\n*** Failed. Expected MAX priority -1\n***" + echo -e "\n***\n*** Failed. Expected MAX priority -5\n***" RET=1 fi @@ -89,6 +104,7 @@ wait $SERVER_PID if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else + cat $SERVER_LOG echo -e "\n***\n*** Test FAILED\n***" fi diff --git a/qa/L0_python_api/test.sh b/qa/L0_python_api/test.sh new file mode 100755 index 0000000000..0d87d16771 --- /dev/null +++ b/qa/L0_python_api/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +BINDING_TEST_LOG="./python_binding.log" + +RET=0 + +rm -f $BINDING_TEST_LOG + +set +e + +python -m pytest --junitxml=test_binding_report.xml test_binding.py > $BINDING_TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $BINDING_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +API_TEST_LOG="./python_api.log" + +python -m pytest --junitxml=test_api_report.xml test_api.py > $API_TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $API_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + + +FRONTEND_TEST_LOG="./python_kserve.log" +python -m pytest --junitxml=test_kserve.xml test_kserve.py > $FRONTEND_TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + cat $FRONTEND_TEST_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_python_api/test_kserve.py b/qa/L0_python_api/test_kserve.py new file mode 100644 index 0000000000..021ce9be17 --- /dev/null +++ b/qa/L0_python_api/test_kserve.py @@ -0,0 +1,298 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
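+
+# Exercises the tritonfrontend KServeHttp and KServeGrpc bindings end to end:
+# Options validation, frontend start/stop ordering, inference over HTTP and
+# gRPC, and request handling while the frontend or server is shutting down.
+# A minimal lifecycle sketch, mirroring the setup_service()/teardown_service()
+# helpers in testing_utils.py (the port used here is the one these tests pass
+# in, not a documented default of the API):
+#
+#   server = utils.setup_server()          # tritonserver.Server(...).start()
+#   options = KServeHttp.Options(port=8005)
+#   service = KServeHttp.Server(server=server, options=options)
+#   service.start()
+#   client = httpclient.InferenceServerClient(url="localhost:8005")
+#   assert client.is_server_ready()
+#   client.close(); service.stop(); server.stop()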
+ +import time +from functools import partial + +import numpy as np +import pytest +import testing_utils as utils +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonserver +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + + +class TestHttpOptions: + def test_correct_http_parameters(self): + KServeHttp.Options( + address="0.0.0.1", port=8080, reuse_port=True, thread_count=16 + ) + + def test_wrong_http_parameters(self): + # Out of range + with pytest.raises(Exception): + KServeHttp.Options(port=-15) + with pytest.raises(Exception): + KServeHttp.Options(thread_count=-5) + + # Wrong data type + with pytest.raises(Exception): + KServeHttp.Options(header_forward_pattern=10) + + +class TestGrpcOptions: + def test_correct_grpc_parameters(self): + KServeGrpc.Options( + infer_compression_level=KServeGrpc.Grpc_compression_level.HIGH, + reuse_port=True, + infer_allocation_pool_size=12, + http2_max_pings_without_data=10, + ) + + def test_wrong_grpc_parameters(self): + # Out of Range + with pytest.raises(Exception): + KServeGrpc.Options(port=-5) + with pytest.raises(Exception): + KServeGrpc.Options(keepalive_timeout_ms=-20_000) + + # Wrong data type + with pytest.raises(Exception): + KServeGrpc.Options(infer_allocation_pool_size="big pool") + with pytest.raises(Exception): + KServeGrpc.Options(server_key=10) + + +HTTP_ARGS = (KServeHttp, httpclient, "localhost:8000") # Default HTTP args +GRPC_ARGS = (KServeGrpc, grpcclient, "localhost:8001") # Default GRPC args + + +class TestKServe: + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_server_ready(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + client = utils.setup_client(client_type, url=url) + + assert client.is_server_ready() + + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_service_double_start(self, frontend): + server = utils.setup_server() + # setup_service() performs service.start() + service = utils.setup_service(server, frontend) + + with pytest.raises( + tritonserver.AlreadyExistsError, match="server is already running." + ): + service.start() + + utils.teardown_server(server) + utils.teardown_service(service) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_invalid_options(self, frontend): + server = utils.setup_server() + # Current flow is KServeHttp.Options or KServeGrpc.Options have to be + # provided to ensure type and range validation occurs. + with pytest.raises( + tritonserver.InvalidArgumentError, + match="Incorrect type for options. 
options argument must be of type", + ): + frontend.Server(server, {"port": 8001}) + + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend", [HTTP_ARGS[0], GRPC_ARGS[0]]) + def test_server_service_order(self, frontend): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + utils.teardown_server(server) + utils.teardown_service(service) + + @pytest.mark.parametrize("frontend, client_type", [HTTP_ARGS[:2], GRPC_ARGS[:2]]) + def test_service_custom_port(self, frontend, client_type): + server = utils.setup_server() + options = frontend.Options(port=8005) + service = utils.setup_service(server, frontend, options) + client = utils.setup_client(client_type, url="localhost:8005") + + # Confirms that service starts at port 8005 + client.is_server_ready() + + utils.teardown_client(client) + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS, GRPC_ARGS]) + def test_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + # TODO: use common/test_infer + assert utils.send_and_test_inference_identity(client_type, url=url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_streaming_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_stream_inference(client_type, url) + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_generate_inference(self, frontend, client_type, url): + server = utils.setup_server() + service = utils.setup_service(server, frontend) + + assert utils.send_and_test_generate_inference() + + utils.teardown_service(service) + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [HTTP_ARGS]) + def test_http_req_during_shutdown(self, frontend, client_type, url): + server = utils.setup_server() + http_service = utils.setup_service(server, frontend) + http_client = httpclient.InferenceServerClient(url="localhost:8000") + model_name = "delayed_identity" + delay = 2 # seconds + input_data0 = np.array([[delay]], dtype=np.float32) + + input0 = httpclient.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + async_request = http_client.async_infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # http_service.stop() does not use graceful shutdown + utils.teardown_service(http_service) + + # So, inference request will fail as http endpoints have been stopped. + with pytest.raises( + InferenceServerException, match="failed to obtain inference response" + ): + async_request.get_result(block=True, timeout=delay) + + # http_client.close() calls join() to terminate pool of greenlets + # However, due to an unsuccessful get_result(), async_request is still + # an active thread. Hence, join stalls until greenlet timeouts. + # Does not throw an exception, but displays error in logs. + utils.teardown_client(http_client) + + # delayed_identity will still be an active model + # Hence, server.stop() causes InternalError: Timeout. + with pytest.raises( + tritonserver.InternalError, + match="Exit timeout expired. 
Exiting immediately.", + ): + utils.teardown_server(server) + + @pytest.mark.parametrize("frontend, client_type, url", [GRPC_ARGS]) + def test_grpc_req_during_shutdown(self, frontend, client_type, url): + server = utils.setup_server() + grpc_service = utils.setup_service(server, frontend) + grpc_client = grpcclient.InferenceServerClient(url=url) + user_data = [] + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + model_name = "delayed_identity" + delay = 2 # seconds + + input_data0 = np.array([[delay]], dtype=np.float32) + input0 = client_type.InferInput("INPUT0", input_data0.shape, "FP32") + input0.set_data_from_numpy(input_data0) + + inputs = [input0] + outputs = [client_type.InferRequestedOutput("OUTPUT0")] + + grpc_client.async_infer( + model_name=model_name, + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + utils.teardown_service(grpc_service) + + time_out = delay + 1 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + + # Depending on when gRPC frontend shut down StatusCode can vary + acceptable_failure_msgs = [ + "[StatusCode.CANCELLED] CANCELLED", + "[StatusCode.UNAVAILABLE] failed to connect to all addresses", + ] + + assert ( + len(user_data) == 1 + and isinstance(user_data[0], InferenceServerException) + and any( + failure_msg in str(user_data[0]) + for failure_msg in acceptable_failure_msgs + ) + ) + + utils.teardown_client(grpc_client) + utils.teardown_server(server) + + # KNOWN ISSUE: CAUSES SEGFAULT + # Created [DLIS-7231] to address at future date + # Once the server has been stopped, the underlying TRITONSERVER_Server instance + # is deleted. However, the frontend does not know the server instance + # is no longer valid. + # def test_inference_after_server_stop(self): + # server = utils.setup_server() + # http_service = utils.setup_service(server, KServeHttp) + # http_client = setup_client(httpclient, url="localhost:8000") + + # teardown_server(server) # Server has been stopped + + # model_name = "identity" + # input_data = np.array([["testing"]], dtype=object) + # # Create input and output objects + # inputs = [httpclient.InferInput("INPUT0", input_data.shape, "BYTES")] + # outputs = [httpclient.InferRequestedOutput("OUTPUT0")] + + # # Set the data for the input tensor + # inputs[0].set_data_from_numpy(input_data) + + # results = http_client.infer(model_name, inputs=inputs, outputs=outputs) + + # utils.teardown_client(http_client) + # utils.teardown_service(http_service) diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py new file mode 100644 index 0000000000..b6095cec8f --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/1/model.py @@ -0,0 +1,51 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Mock Model that uses the input data to determine how long to wait + before returning identity data + """ + assert len(requests) == 1 + delay = 0 + request = requests[0] + responses = [] + + delay_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + delay_as_numpy = delay_tensor.as_numpy() + delay = float(delay_as_numpy[0][0]) + + out_tensor = pb_utils.Tensor("OUTPUT0", delay_as_numpy) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + time.sleep(delay) + return responses diff --git a/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt new file mode 100644 index 0000000000..9ac8f1aaff --- /dev/null +++ b/qa/L0_python_api/test_model_repository/delayed_identity/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
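+
+# Configuration for the "delayed_identity" Python model implemented in
+# 1/model.py above: a single FP32 INPUT0 whose value is interpreted as a
+# delay in seconds before the same value is returned as OUTPUT0. The shutdown
+# tests in test_kserve.py rely on this delay to keep a request in flight
+# while the frontend or server is being stopped.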
+ +name: "delayed_identity" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] \ No newline at end of file diff --git a/qa/L0_python_api/test_model_repository/identity/1/model.py b/qa/L0_python_api/test_model_repository/identity/1/model.py new file mode 100644 index 0000000000..629b6469c9 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model loops through different dtypes to make sure that + serialize_byte_tensor works correctly in the Python backend. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", in_0.as_numpy().astype(self._dtypes[self._index]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/L0_python_api/test_model_repository/identity/config.pbtxt b/qa/L0_python_api/test_model_repository/identity/config.pbtxt new file mode 100644 index 0000000000..3f22e14468 --- /dev/null +++ b/qa/L0_python_api/test_model_repository/identity/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] \ No newline at end of file diff --git a/qa/L0_python_api/testing_utils.py b/qa/L0_python_api/testing_utils.py new file mode 100644 index 0000000000..79901f7411 --- /dev/null +++ b/qa/L0_python_api/testing_utils.py @@ -0,0 +1,153 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
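+
+# Shared helpers for the L0_python_api tests: setup_server()/teardown_server()
+# manage the in-process tritonserver.Server, setup_service()/teardown_service()
+# start and stop a KServeHttp or KServeGrpc frontend, setup_client()/
+# teardown_client() wrap the tritonclient classes, and the send_and_test_*
+# functions drive identity/delayed_identity inference over HTTP, gRPC
+# streaming, and the HTTP /generate endpoint on behalf of test_kserve.py.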
+ +import os +import queue +from functools import partial +from typing import Union + +import numpy as np +import requests +import tritonserver +from tritonclient.utils import InferenceServerException +from tritonfrontend import KServeGrpc, KServeHttp + +# TODO: Re-Format documentation to fit: +# https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings + + +def setup_server(model_repository="test_model_repository") -> tritonserver.Server: + module_directory = os.path.split(os.path.abspath(__file__))[0] + model_path = os.path.abspath(os.path.join(module_directory, model_repository)) + + # Starting Server Instance + server_options = tritonserver.Options( + server_id="TestServer", + model_repository=model_path, + log_error=True, + log_warn=True, + log_info=True, + ) + + return tritonserver.Server(server_options).start(wait_until_ready=True) + + +def teardown_server(server: tritonserver.Server) -> None: + server.stop() + + +def setup_service( + server: tritonserver.Server, + frontend: Union[KServeHttp, KServeGrpc], + options=None, +) -> Union[KServeHttp, KServeGrpc]: + service = frontend.Server(server=server, options=options) + service.start() + return service + + +def teardown_service(service: Union[KServeHttp, KServeGrpc]) -> None: + service.stop() + + +def setup_client(frontend_client, url: str): + return frontend_client.InferenceServerClient(url=url) + + +def teardown_client(client) -> None: + client.close() + + +# Sends an inference to test_model_repository/identity model and verifies input == output. +def send_and_test_inference_identity(frontend_client, url: str) -> bool: + model_name = "identity" + client = setup_client(frontend_client, url) + input_data = np.array(["testing"], dtype=object) + + # Create input and output objects + inputs = [frontend_client.InferInput("INPUT0", input_data.shape, "BYTES")] + outputs = [frontend_client.InferRequestedOutput("OUTPUT0")] + # Set the data for the input tensor + inputs[0].set_data_from_numpy(input_data) + + # Perform inference request + results = client.infer(model_name=model_name, inputs=inputs, outputs=outputs) + + output_data = results.as_numpy("OUTPUT0") # Gather output data + + teardown_client(client) + return input_data[0] == output_data[0].decode() + + +# Sends multiple streaming requests to "delayed_identity" model with negligible delays, +# and verifies the inputs matches outputs and the ordering is preserved. 
+def send_and_test_stream_inference(frontend_client, url: str) -> bool: + num_requests = 100 + requests = [] + for i in range(num_requests): + input0_np = np.array([[float(i) / 1000]], dtype=np.float32) + inputs = [frontend_client.InferInput("INPUT0", input0_np.shape, "FP32")] + inputs[0].set_data_from_numpy(input0_np) + requests.append(inputs) + + responses = [] + + def callback(responses, result, error): + responses.append({"result": result, "error": error}) + + client = frontend_client.InferenceServerClient(url=url) + client.start_stream(partial(callback, responses)) + for inputs in requests: + client.async_stream_infer("delayed_identity", inputs) + client.stop_stream() + teardown_client(client) + + assert len(responses) == num_requests + for i in range(len(responses)): + assert responses[i]["error"] is None + output0_np = responses[i]["result"].as_numpy(name="OUTPUT0") + assert np.allclose(output0_np, [[float(i) / 1000]]) + + return True # test passed + + +def send_and_test_generate_inference() -> bool: + model_name = "identity" + url = f"http://localhost:8000/v2/models/{model_name}/generate" + input_text = "testing" + data = { + "INPUT0": input_text, + } + + response = requests.post(url, json=data, stream=True) + if response.status_code == 200: + result = response.json() + output_text = result.get("OUTPUT0", "") + + if output_text == input_text: + return True + + return False diff --git a/qa/L0_python_client_unit_tests/test.sh b/qa/L0_python_client_unit_tests/test.sh new file mode 100755 index 0000000000..5a46ecccc5 --- /dev/null +++ b/qa/L0_python_client_unit_tests/test.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
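+
+# Runs the client library unit tests shipped in the server QA container via
+# "python3 -m unittest discover" against
+# /opt/tritonserver/qa/python_client_unit_tests. torch is installed first
+# because the DLPack test needs it to validate GPU tensors; output is written
+# to python_client_unit_tests.log and printed only when a test fails.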
+ +TEST_LOG="./python_client_unit_tests.log" +PYTHON_CLIENT_UNIT_TESTS_DIR=/opt/tritonserver/qa/python_client_unit_tests/ +PYTHON_CLIENT_UNIT_TESTS_CMD="python3 -m unittest discover -v -s $PYTHON_CLIENT_UNIT_TESTS_DIR -t $PYTHON_CLIENT_UNIT_TESTS_DIR" + +# DLPack test requires Torch to validate GPU tensor +pip3 install torch + +RET=0 + +rm -f $TEST_LOG + +set +e + +$PYTHON_CLIENT_UNIT_TESTS_CMD > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_pytorch_python_runtime/infer.py b/qa/L0_pytorch_python_runtime/infer.py new file mode 100755 index 0000000000..aeda498710 --- /dev/null +++ b/qa/L0_pytorch_python_runtime/infer.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
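+
+# Client-side driver for the PyTorch Python-based runtime tests: infers the
+# "addsub" model (no parameter file), the "neuralnet" model (inputs and
+# expected labels read from neuralnet_test_data.json), and then submits
+# single-sample requests in parallel to exercise a full dynamic batch of
+# size 8 (see parallel_infer_a_full_dynamic_batch below). Exits non-zero if
+# any check fails so that test.sh can flag the failure.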
+ +import concurrent.futures +import json +import sys + +import numpy as np +import tritonclient.http as httpclient +from tritonclient.utils import * + + +def infer_model_without_parameter_file(): + model_name = "addsub" + shape = [4] + + with httpclient.InferenceServerClient("localhost:8000") as client: + input0_data = np.random.rand(*shape).astype(np.float32) + input1_data = np.random.rand(*shape).astype(np.float32) + inputs = [ + httpclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + httpclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input1_data.dtype) + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0"), + httpclient.InferRequestedOutput("OUTPUT1"), + ] + + response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs) + + output0_data = response.as_numpy("OUTPUT0") + output1_data = response.as_numpy("OUTPUT1") + + print( + "INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output0_data + ) + ) + print( + "INPUT0 ({}) - INPUT1 ({}) = OUTPUT0 ({})".format( + input0_data, input1_data, output1_data + ) + ) + + if not np.allclose(input0_data + input1_data, output0_data): + print(model_name + " error: incorrect sum") + return False + + if not np.allclose(input0_data - input1_data, output1_data): + print(model_name + " error: incorrect difference") + return False + + print("PASS: " + model_name) + return True + + +def infer_model_with_parameter_file(batch_size, data_offset=0): + model_name = "neuralnet" + test_data_file = "neuralnet_test_data.json" + np_dtype = np.single + + # prepare input data + with open(test_data_file) as f: + test_data = json.load(f) + input_data = np.array(test_data["input_data"], dtype=np_dtype) + input_data = input_data[data_offset : (data_offset + batch_size)] + labels = test_data["labels"][data_offset : (data_offset + batch_size)] + + # inference + with httpclient.InferenceServerClient("localhost:8000") as client: + inputs = [ + httpclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + + response = client.infer(model_name, inputs, request_id=str(1)) + output_data = response.as_numpy("OUTPUT") + output_data_max = np.max(output_data, axis=1) + + print("Inference result: " + str(output_data)) + print("Inference result (max): " + str(output_data_max)) + print("Expected result: " + str(labels)) + + if not np.all(np.isclose(np.max(output_data, axis=1), labels, atol=8)): + print(model_name + " error: incorrect result") + return False + + print("PASS: " + model_name) + return True + + +def parallel_infer_a_full_dynamic_batch(max_batch_size): + batch_size = 1 + success = True + with concurrent.futures.ThreadPoolExecutor() as pool: + threads = [] + for i in range(max_batch_size // batch_size): + t = pool.submit(infer_model_with_parameter_file, batch_size, i) + threads.append(t) + for t in threads: + success &= t.result() + return success + + +if __name__ == "__main__": + success = infer_model_without_parameter_file() + success &= infer_model_with_parameter_file(batch_size=4) + success &= parallel_infer_a_full_dynamic_batch(max_batch_size=8) + if not success: + sys.exit(1) + sys.exit(0) diff --git a/qa/L0_pytorch_python_runtime/test.sh b/qa/L0_pytorch_python_runtime/test.sh new file mode 100755 index 0000000000..23ce022955 --- /dev/null +++ 
b/qa/L0_pytorch_python_runtime/test.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATA_DIR=/data/inferenceserver/${REPO_VERSION} +IMAGE_DIR="/opt/tritonserver/qa/images" +SERVER=/opt/tritonserver/bin/tritonserver +IMAGE_CLIENT="/opt/tritonserver/qa/clients/image_client.py" +BACKENDS="/opt/tritonserver/backends" +source ../common/util.sh + +if [ ! -f "$BACKENDS/pytorch/pb_exec_env_model.py.tar.gz" ]; then + PYTORCH_BACKEND_REPO_TAG=${PYTORCH_BACKEND_REPO_TAG:="main"} + rm -rf pytorch_backend + git clone --single-branch --depth=1 -b $PYTORCH_BACKEND_REPO_TAG https://github.com/triton-inference-server/pytorch_backend + (cd pytorch_backend/tools && \ + ./gen_pb_exec_env.sh && \ + mv pb_exec_env_model.py.tar.gz $BACKENDS/pytorch) +fi + +rm -f *.log +RET=0 + +# +# Unit tests +# +rm -rf py_runtime_exec_env py_runtime_exec_env.tar.gz py_runtime.py +cp $BACKENDS/pytorch/model.py py_runtime.py +cp $BACKENDS/pytorch/pb_exec_env_model.py.tar.gz py_runtime_exec_env.tar.gz +mkdir py_runtime_exec_env && tar -xzf py_runtime_exec_env.tar.gz -C py_runtime_exec_env + +set +e + +UNIT_TEST_ENV="source py_runtime_exec_env/bin/activate && exec env LD_LIBRARY_PATH=`pwd`/py_runtime_exec_env/lib:$LD_LIBRARY_PATH" +UNIT_TEST_LOG="./unit_test.log" +bash -c "$UNIT_TEST_ENV python3 unit_test.py" > $UNIT_TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Failed PyTorch Python backend based runtime unit test\n***" + cat $UNIT_TEST_LOG + RET=1 +fi + +set -e + +# +# End-to-end inference tests +# +rm -rf models && mkdir models +cp -r $DATA_DIR/pytorch_model_store/* models +cp -r $DATA_DIR/libtorch_model_store/resnet50_libtorch models && \ + sed -i "/platform/d" models/resnet50_libtorch/config.pbtxt && \ + echo "backend: \"pytorch\"" >> models/resnet50_libtorch/config.pbtxt && \ + echo "runtime: \"model.py\"" >> models/resnet50_libtorch/config.pbtxt && \ + echo "instance_group: [{ kind: KIND_MODEL }]" >> models/resnet50_libtorch/config.pbtxt +mv models/neuralnet/1/test_data.json neuralnet_test_data.json + +SERVER_ARGS="--model-repository=models --log-verbose=1" +SERVER_LOG="./infer.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 +else + set +e + + # Check correct model instance initialization + EXPECTED_LOG_MSGS=( + 'Loading '"'"'resnet50_libtorch'"'"' as TorchScript' + 'Torch parallelism settings for '"'"'addsub'"'"': NUM_THREADS = 1; NUM_INTEROP_THREADS = 1;' + 'Torch parallelism settings for '"'"'neuralnet'"'"': NUM_THREADS = 4; NUM_INTEROP_THREADS = 2;' + 'Torch parallelism settings for '"'"'resnet50_libtorch'"'"': NUM_THREADS = 1; NUM_INTEROP_THREADS = 1;' + ''"'"'torch.compile'"'"' optional parameter(s) for '"'"'addsub'"'"': {'"'"'disable'"'"': True}' + ''"'"'torch.compile'"'"' optional parameter(s) for '"'"'neuralnet'"'"': {}' + ''"'"'torch.compile'"'"' optional parameter(s) for '"'"'resnet50_libtorch'"'"': {}' + ) + for EXPECTED_LOG_MSG in "${EXPECTED_LOG_MSGS[@]}"; do + grep "$EXPECTED_LOG_MSG" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Cannot find \"$EXPECTED_LOG_MSG\" on server log. \n***" + cat $SERVER_LOG + RET=1 + fi + done + + # Infer TorchScript model + CLIENT_LOG="./infer.torchscript.log" + python $IMAGE_CLIENT -m "resnet50_libtorch" -s INCEPTION -c 1 -b 2 "$IMAGE_DIR/vulture.jpeg" > $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to inference TorchScript model\n***" + cat $CLIENT_LOG + RET=1 + fi + + # Infer PyTorch models + CLIENT_LOG="./infer.pytorch.log" + python infer.py > $CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed to inference PyTorch models\n***" + cat $CLIENT_LOG + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +fi + +# +# Print result and exit +# +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_pytorch_python_runtime/unit_test.py b/qa/L0_pytorch_python_runtime/unit_test.py new file mode 100755 index 0000000000..5b69f23a8a --- /dev/null +++ b/qa/L0_pytorch_python_runtime/unit_test.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys +import unittest + +import torch + +# satisfy Python runtime import requirements +sys.modules["triton_python_backend_utils"] = unittest.mock.MagicMock() +# import modules from Python runtime to be tested +from py_runtime import _gather_torch_tensors, _scatter_torch_tensors + + +class PyTorchPythonBackendRuntimeUnittest(unittest.TestCase): + # _gather_scatter_cases: [(tensors_scatter, tensors_gather, sections), ...] + # tensors_scatter: [an_infer_request, ...] + # an_infer_request: [a_torch_tensor_with_batch_dimension, ...] + # tensors_gather: [a_torch_tensor_gathering_all_requests, ...] + # sections: [batch_size_of_the_corresponding_infer_request, ...] + _gather_scatter_cases = [ + # shape [batch=1, 1] + ([[torch.tensor([[1]])]], [torch.tensor([[1]])], [1]), + # shape [batch=1, 2] + ([[torch.tensor([[1, 2]])]], [torch.tensor([[1, 2]])], [1]), + # shape [batch=1, 2, 4] + ([[torch.arange(8).reshape(1, 2, 4)]], [torch.arange(8).reshape(1, 2, 4)], [1]), + # shape [batch=3, 1] + ([[torch.arange(3).reshape(3, 1)]], [torch.arange(3).reshape(3, 1)], [3]), + # shapes ([batch=1, 1], [batch=1, 2]) + ( + [[torch.tensor([[1]]), torch.tensor([[2, 3]])]], + [torch.tensor([[1]]), torch.tensor([[2, 3]])], + [1], + ), + # scatter shape [batch=1, 1] x 2 -> gather shape [batch=2, 1] + ( + [[torch.tensor([[1]])], [torch.tensor([[2]])]], + [torch.tensor([[1], [2]])], + [1, 1], + ), + # scatter shape [batch=1, 2, 1] x 3 -> gather shape [batch=3, 2, 1] + ( + [[torch.tensor([[[i], [i + 3]]])] for i in range(3)], + [torch.tensor([[[0], [3]], [[1], [4]], [[2], [5]]])], + [1, 1, 1], + ), + # scatter shape [batch=1, 1] & [batch=2, 1] -> gather shape [batch=3, 1] + ( + [[torch.tensor([[1]])], [torch.tensor([[2], [3]])]], + [torch.tensor([[1], [2], [3]])], + [1, 2], + ), + # scatter shape [batch=3, 1, 1] & [batch=1, 1, 1] & [batch=2, 1, 1] + # -> gather shape [batch=6, 1, 1] + ( + [ + [torch.tensor([[[0]], [[1]], [[2]]])], + [torch.tensor([[[3]]])], + [torch.tensor([[[4]], [[5]]])], + ], + [torch.arange(6).reshape(6, 1, 1)], + [3, 1, 2], + ), + # scatter shapes ([batch=3, 1, 1], [batch=3, 2]) & ([batch=2, 1, 1], [batch=2, 2]) + # -> gather shapes ([batch=5, 1, 1], [batch=5, 2]) + ( + [ + [ + torch.tensor([[[0]], [[1]], [[2]]]), + torch.tensor([[5, 6], [7, 8], [9, 10]]), + ], + [torch.tensor([[[3]], [[4]]]), torch.tensor([[11, 12], [13, 14]])], + ], + [ + torch.arange(5).reshape(5, 1, 1), + torch.arange(start=5, end=15).reshape(5, 2), + ], + [3, 2], + ), + ] + + def test_gather_torch_tensors(self): + for ( + tensors_scatter, + expected_tensors_gather, + 
expected_sections, + ) in self._gather_scatter_cases: + tensors_gather, sections = _gather_torch_tensors(tensors_scatter) + + self.assertIsInstance(tensors_gather, list) + self.assertEqual(len(tensors_gather), len(expected_tensors_gather)) + for j in range(len(expected_tensors_gather)): + expected_tensor = expected_tensors_gather[j] + tensor = tensors_gather[j] + self.assertIsInstance(tensor, torch.Tensor) + self.assertTrue(torch.equal(tensor, expected_tensor)) + + self.assertIsInstance(sections, list) + self.assertEqual(len(sections), len(expected_sections)) + for i in range(len(expected_sections)): + expected_section = expected_sections[i] + section = sections[i] + self.assertIsInstance(section, int) + self.assertEqual(section, expected_section) + + def test_scatter_torch_tensors(self): + for ( + expected_tensors_scatter, + tensors_gather, + sections, + ) in self._gather_scatter_cases: + tensors_scatter = _scatter_torch_tensors(tensors_gather, sections) + self.assertIsInstance(tensors_scatter, list) + self.assertEqual(len(tensors_scatter), len(expected_tensors_scatter)) + for i in range(len(expected_tensors_scatter)): + expected_tensors = expected_tensors_scatter[i] + tensors = tensors_scatter[i] + self.assertIsInstance(tensors, list) + self.assertEqual(len(tensors), len(expected_tensors)) + for j in range(len(expected_tensors)): + expected_tensor = expected_tensors[j] + tensor = tensors[j] + self.assertIsInstance(tensor, torch.Tensor) + self.assertTrue(torch.equal(tensor, expected_tensor)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_query/models/query/config.pbtxt b/qa/L0_query/models/query/config.pbtxt new file mode 100644 index 0000000000..5251ab0422 --- /dev/null +++ b/qa/L0_query/models/query/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
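+
+# Minimal configuration for the "query" backend model used by the L0_query
+# tests: a single UINT8 input of shape [ 1 ] and two variable-size UINT8
+# outputs with no batching (max_batch_size 0). query_e2e.py expects the
+# backend to surface the queried output memory type and ID (for example
+# "OUTPUT0 CPU 0") in the returned error message.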
+name: "query" +backend: "query" +max_batch_size: 0 +input [ + { + name: "INPUT" + data_type: TYPE_UINT8 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_UINT8 + dims: [ -1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_UINT8 + dims: [ -1 ] + } +] diff --git a/qa/L0_query/query_e2e.py b/qa/L0_query/query_e2e.py new file mode 100755 index 0000000000..048a4a8d41 --- /dev/null +++ b/qa/L0_query/query_e2e.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
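+
+# End-to-end checks for output buffer property queries: test.sh runs this with
+# TEST_FAIL_WITH_QUERY_RESULT=1 so the query backend fails each request and
+# embeds the queried memory type and ID (e.g. "OUTPUT0 CPU 0" or
+# "OUTPUT0 GPU 0") in the error message. The cases below verify that result
+# over HTTP and gRPC, with outputs placed in CUDA shared memory, and with
+# regions too small to hold the output (which fall back to the CPU default).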
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient +from tritonclient.utils import InferenceServerException +from tritonclient.utils import cuda_shared_memory as cudashm + + +class QueryTest(tu.TestResultCollector): + def test_http(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) + + try: + triton_client.infer(model_name="query", inputs=inputs) + self.assertTrue(False, "expect error with query information") + except InferenceServerException as ex: + self.assertTrue("OUTPUT0 CPU 0" in ex.message()) + self.assertTrue("OUTPUT1 CPU 0" in ex.message()) + + def test_http_shared_memory(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) + + # Set up CUDA shared memory for outputs + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 4, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 4, 0) + triton_client.register_cuda_shared_memory( + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4 + ) + triton_client.register_cuda_shared_memory( + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4 + ) + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs[-1].set_shared_memory("output0_data", 4) + + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + outputs[-1].set_shared_memory("output1_data", 4) + + try: + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) + self.assertTrue(False, "expect error with query information") + except InferenceServerException as ex: + self.assertTrue("OUTPUT0 GPU 0" in ex.message()) + self.assertTrue("OUTPUT1 GPU 0" in ex.message()) + + cudashm.destroy_shared_memory_region(shm_op0_handle) + cudashm.destroy_shared_memory_region(shm_op1_handle) + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + + def test_http_out_of_shared_memory(self): + triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) + + # Set up too small CUDA shared memory for outputs, expect query + # returns default value + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0) + triton_client.register_cuda_shared_memory( + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1 + ) + triton_client.register_cuda_shared_memory( + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1 + ) + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs[-1].set_shared_memory("output0_data", 1) + + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + 
outputs[-1].set_shared_memory("output1_data", 1) + + try: + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) + self.assertTrue(False, "expect error with query information") + except InferenceServerException as ex: + self.assertTrue("OUTPUT0 CPU 0" in ex.message()) + self.assertTrue("OUTPUT1 CPU 0" in ex.message()) + + cudashm.destroy_shared_memory_region(shm_op0_handle) + cudashm.destroy_shared_memory_region(shm_op1_handle) + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + + def test_grpc(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) + + try: + triton_client.infer(model_name="query", inputs=inputs) + self.assertTrue(False, "expect error with query information") + except InferenceServerException as ex: + self.assertTrue("OUTPUT0 CPU 0" in ex.message()) + self.assertTrue("OUTPUT1 CPU 0" in ex.message()) + + def test_grpc_shared_memory(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) + + # Set up CUDA shared memory for outputs + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 4, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 4, 0) + triton_client.register_cuda_shared_memory( + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4 + ) + triton_client.register_cuda_shared_memory( + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4 + ) + outputs = [] + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) + outputs[-1].set_shared_memory("output0_data", 4) + + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) + outputs[-1].set_shared_memory("output1_data", 4) + + try: + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) + self.assertTrue(False, "expect error with query information") + except InferenceServerException as ex: + self.assertTrue("OUTPUT0 GPU 0" in ex.message()) + self.assertTrue("OUTPUT1 GPU 0" in ex.message()) + + cudashm.destroy_shared_memory_region(shm_op0_handle) + cudashm.destroy_shared_memory_region(shm_op1_handle) + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + + def test_grpc_out_of_shared_memory(self): + triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") + inputs = [] + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) + + # Set up too small CUDA shared memory for outputs, expect query + # returns default value + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0) + triton_client.register_cuda_shared_memory( + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1 + ) + triton_client.register_cuda_shared_memory( + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1 + ) + outputs = [] + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) + 
outputs[-1].set_shared_memory("output0_data", 1) + + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) + outputs[-1].set_shared_memory("output1_data", 1) + + try: + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) + self.assertTrue(False, "expect error with query information") + except InferenceServerException as ex: + self.assertTrue("OUTPUT0 CPU 0" in ex.message()) + self.assertTrue("OUTPUT1 CPU 0" in ex.message()) + + cudashm.destroy_shared_memory_region(shm_op0_handle) + cudashm.destroy_shared_memory_region(shm_op1_handle) + triton_client.unregister_system_shared_memory() + triton_client.unregister_cuda_shared_memory() + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_query/test.sh b/qa/L0_query/test.sh new file mode 100755 index 0000000000..153cd69381 --- /dev/null +++ b/qa/L0_query/test.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +TEST_LOG="./query_test.log" +CLIENT_LOG="./query_client.log" +TEST_EXEC=./query_test +TEST_PY=./query_e2e.py +EXPECTED_NUM_TESTS="6" +TEST_RESULT_FILE='test_results.txt' + + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +unset TEST_FAIL_WITH_QUERY_RESULT +unset TEST_BYTE_SIZE + +set +e +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Query Unit Test Failed\n***" + RET=1 +fi +set -e + +export TEST_FAIL_WITH_QUERY_RESULT=1 +export TEST_BYTE_SIZE=4 + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST_PY >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +unset TEST_FAIL_WITH_QUERY_RESULT +unset TEST_BYTE_SIZE + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_rate_limiter/rate_limiter_test.py b/qa/L0_rate_limiter/rate_limiter_test.py new file mode 100755 index 0000000000..309e7f0a3d --- /dev/null +++ b/qa/L0_rate_limiter/rate_limiter_test.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
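+
+# End-to-end driver used by qa/L0_rate_limiter/test.sh: it stresses the custom test
+# models with concurrent async gRPC requests and checks how the
+# --rate-limit=execution_count scheduler apportions executions across model
+# priorities and resource constraints.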
+ +import sys + +sys.path.append("../common") + +import functools +import os +import threading +import time +import unittest + +import numpy as np +import sequence_util as su +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +_inference_count = 80 +_inference_concurrency = 8 +_response_wait_time_s = 10 +_finish_wait_time_s = 10 +_exit_signal = False + + +class AsyncGrpcRunner: + def __init__(self, tester, server_url, model_name, delay_ms): + self._tester = tester + self._server_url = server_url + self._model_name = model_name + self._delay_ms = delay_ms + + self._input_data = [] + self._shape = [1, 1] + self._dtype = np.float32 + self._results = {} + self._processed_all = False + self._errors = [] + self._inflight_requests = 0 + self._num_sent_request = 0 + self._processed_request_count = 0 + self._sync = threading.Condition() + self._req_thread = threading.Thread(target=self.req_loop, daemon=True) + + def _on_result(self, result, error): + with self._sync: + if error: + self._errors.append(error) + else: + this_id = int(result.get_response().id) + self._results[this_id] = result + self._inflight_requests -= 1 + self._sync.notify_all() + + def req_loop(self): + client = grpcclient.InferenceServerClient(self._server_url) + + inputs = [ + grpcclient.InferInput( + "INPUT0", self._shape, np_to_triton_dtype(self._dtype) + ) + ] + + self._inflight_requests = 0 + start_stat = client.get_inference_statistics(model_name=self._model_name) + global _exit_signal + + while not _exit_signal: + input_numpy = np.random.random_sample(self._shape).astype(self._dtype) + inputs[0].set_data_from_numpy(input_numpy) + self._input_data.append(input_numpy) + + with self._sync: + + def _check_can_send(): + return self._inflight_requests < _inference_concurrency + + can_send = self._sync.wait_for( + _check_can_send, timeout=_response_wait_time_s + ) + self._tester.assertTrue( + can_send, + "client didn't receive a response within {}s".format( + _response_wait_time_s + ), + ) + + callback = functools.partial(AsyncGrpcRunner._on_result, self) + client.async_infer( + model_name=self._model_name, + inputs=inputs, + request_id="{}".format(self._num_sent_request), + callback=callback, + ) + self._inflight_requests += 1 + self._num_sent_request += 1 + if self._num_sent_request == _inference_count: + _exit_signal = True + time.sleep(self._delay_ms / 1000.0) + + # wait till receive all requested data + with self._sync: + + def _all_processed(): + return self._inflight_requests == 0 + + self._processed_all = self._sync.wait_for( + _all_processed, _finish_wait_time_s + ) + self._tester.assertTrue( + self._processed_all, + "the processing didn't complete even after waiting for {}s".format( + _finish_wait_time_s + ), + ) + + end_stat = client.get_inference_statistics(model_name=self._model_name) + self._processed_request_count = ( + end_stat.model_stats[0].inference_stats.success.count + - start_stat.model_stats[0].inference_stats.success.count + ) + + def start(self): + self._req_thread.start() + + def _validate_run(self): + if len(self._errors) != 0: + raise self._errors[0] + self._tester.assertEqual( + len(self._input_data), + len(self._results.keys()), + "the number of inputs and output should match", + ) + for i in range(len(self._input_data)): + self._tester.assertFalse( + (self._input_data[i] != self._results[i].as_numpy("OUTPUT0")).any(), + "the output data should match with the input data", + ) + + def join(self): + self._req_thread.join() + self._validate_run() + + +class 
RateLimiterTest(su.SequenceBatcherTestUtil): + def stress_models(self, model_names, delay_ms=0): + infer_counts = {} + try: + runners = [] + for model_name in model_names: + runners.append( + AsyncGrpcRunner( + self, "localhost:8001", model_name, delay_ms=delay_ms + ) + ) + for r in runners: + r.start() + for r in runners: + r.join() + infer_counts[r._model_name] = r._processed_request_count + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + return infer_counts + + def test_single_model(self): + # Send all the inference requests to a single model. + # Simple sanity check. + + model_names = ["custom_zero_1_float32"] + infer_counts = self.stress_models(model_names) + + self.assertEqual(infer_counts[model_names[0]], _inference_count) + + def test_cross_model_prioritization_limited_resource(self): + # Sends requests to two models, one operating at + # priority of 1 and other at 2 respectively. + # The available resource counts doesn't allow models + # to execute simultaneously. + + model_names = ["custom_zero_1_float32", "custom_zero_1_float32_v2"] + + # TODO: Validate the priority and resource counts are set correctly + + infer_counts = self.stress_models(model_names) + infer_ratio = infer_counts[model_names[0]] / float(infer_counts[model_names[1]]) + + self.assertGreater( + infer_ratio, + 1.80, + "Got infer ratio across models {}, expected closer to 2".format( + infer_ratio + ), + ) + + def test_cross_model_prioritization_plenty_resource(self): + # Sends requests to two models, one operating at + # priority of 1 and other at 2 respectively. + # The available resource counts wll allow both models + # to run simultaneously. + + model_names = ["custom_zero_1_float32", "custom_zero_1_float32_v2"] + + # TODO: Validate the priority and resource counts are set correctly + + infer_counts = self.stress_models(model_names) + infer_diff = abs(infer_counts[model_names[0]] - infer_counts[model_names[1]]) + + self.assertGreater( + 10, + infer_diff, + "Got infer difference between models {}, expected closer to 0".format( + infer_diff + ), + ) + + def test_single_model_dynamic_batching(self): + # Send all the inference requests with a delay to a model + + self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + model_names = ["custom_zero_1_float32"] + infer_counts = self.stress_models(model_names, delay_ms=100) + + self.assertEqual(infer_counts[model_names[0]], _inference_count) + + # Check whether all requests used batch size of 4 or not + client = grpcclient.InferenceServerClient("localhost:8001") + stats = client.get_inference_statistics(model_names[0], "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + + batch_stats = stats.model_stats[0].batch_stats + self.assertEqual( + len(batch_stats), + 1, + "expected single batch-size, got {}".format(len(batch_stats)), + ) + + for batch_stat in batch_stats: + self.assertEqual( + batch_stat.batch_size, + 4, + "unexpected batch-size {}".format(batch_stat.batch_size), + ) + # Get count from one of the stats + self.assertEqual( + batch_stat.compute_infer.count, + _inference_count / 4, + "expected model-execution-count {} for batch size {}, got {}".format( + _inference_count / 4, 4, batch_stat.compute_infer.count + ), + ) + + def test_single_model_sequence_batching(self): + # Send one sequence and check for correct accumulator + # result. The result should be returned immediately. + # This test checks whether all the requests are + # directed to the same instance. 
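+        # The sequence below sends the values 1 through 9, so the expected
+        # accumulator result passed to check_sequence is 45.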
+ + try: + model_name = "custom_sequence_int32" + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.check_sequence( + "custom", + model_name, + np.int32, + 5, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + ( + ("start", 1, None, None), + (None, 2, None, None), + (None, 3, None, None), + (None, 4, None, None), + (None, 5, None, None), + (None, 6, None, None), + (None, 7, None, None), + (None, 8, None, None), + ("end", 9, None, None), + ), + 45, + "grpc", + ) + + self.check_deferred_exception() + self.check_status(model_name, {1: 9}, 9, 9) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_rate_limiter/test.sh b/qa/L0_rate_limiter/test.sh new file mode 100755 index 0000000000..334af99e4c --- /dev/null +++ b/qa/L0_rate_limiter/test.sh @@ -0,0 +1,472 @@ +#!/bin/bash +# Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +RATE_LIMITER_TEST=rate_limiter_test.py +TEST_RESULT_FILE='test_results.txt' + +MODELDIR=${MODELDIR:=`pwd`} +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends + + +SERVER_ARGS_EXTRA="--backend-directory=${BACKEND_DIR}" +source ../common/util.sh + +RET=0 + +rm -f *.log +rm -fr ./custom_models && mkdir ./custom_models && \ +cp -r ../custom_models/custom_zero_1_float32 ./custom_models/. && \ +cp -r ../custom_models/custom_sequence_int32 ./custom_models/. 
&& \ +mkdir -p ./custom_models/custom_zero_1_float32/1 && \ +cp -r ./custom_models/custom_zero_1_float32 ./custom_models/custom_zero_1_float32_v2 + + +(cd custom_models/custom_zero_1_float32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + sed -i "s/max_batch_size:.*/max_batch_size: 4/g" config.pbtxt && \ + echo "instance_group [{" >> config.pbtxt && \ + echo "kind: KIND_GPU count: 1" >> config.pbtxt && \ + echo "rate_limiter { resources [{name: \"resource1\" count: 4 }]}" >> config.pbtxt && \ + echo "}]" >> config.pbtxt && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"100\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + + +(cd custom_models/custom_zero_1_float32_v2 && \ + sed -i "s/custom_zero_1_float32/custom_zero_1_float32_v2/g" config.pbtxt && \ + sed -i "s/dims:.*\[.*\]/dims: \[ -1 \]/g" config.pbtxt && \ + sed -i "s/max_batch_size:.*/max_batch_size: 4/g" config.pbtxt && \ + echo "instance_group [{" >> config.pbtxt && \ + echo "kind: KIND_GPU count: 1" >> config.pbtxt && \ + echo "rate_limiter { resources [{name: \"resource1\" count: 2 }, {name: \"resource2\" global: True count: 2 }] priority: 2}" >> config.pbtxt && \ + echo "}]" >> config.pbtxt && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"100\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +## +## Test cases that fails to load models +## +# Case1: Both resource lesser than required +SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:1 --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server_r1.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected success with resource count 1\n***" + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +set +e +grep "Resource count for \"resource1\" is limited to 1 which will prevent scheduling of one or more model instances, the minimum required count is 4" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed. Expected error message while loading the model \"custom_zero_1_float32\"\n***" + RET=1 +fi + +set -e + +# Case2: resources sufficient only for one model +SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:3 --rate-limit-resource=resource2:2 --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server_r3.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected success with resource count 1\n***" + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +set +e +grep "Resource count for \"resource1\" is limited to 3 which will prevent scheduling of one or more model instances, the minimum required count is 4" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed. Expected error message while loading the model \"custom_zero_1_float32\"\n***" + RET=1 +fi + +set -e + +# Case3: Resource specified only for specific device id 10 and not for the GPU that loads the model instance. 
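+# The flag format is --rate-limit-resource=<name>:<count>:<device>; resource1:10:10
+# makes 10 units of resource1 available only on device 10, so device 0 (the one used
+# by the model instances) ends up with a count of 0.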
+SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:10:10 --rate-limit-resource=resource2:2 --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server_rdevice.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected success with resource count 1\n***" + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +set +e +grep "Resource count for \"resource1\" is limited to 0 which will prevent scheduling of one or more model instances, the minimum required count is 4" $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed. Expected error message while loading the model \"custom_zero_1_float32\"\n***" + RET=1 +fi + +set -e + +# Case4: Conflicting resource types in the config +cp -r ./custom_models/custom_zero_1_float32_v2 ./custom_models/custom_zero_1_float32_v3 +(cd custom_models/custom_zero_1_float32_v3 && \ + sed -i "s/custom_zero_1_float32_v2/custom_zero_1_float32_v3/g" config.pbtxt && \ + sed -i "s/global: True/global: False/g " config.pbtxt) + +SERVER_ARGS="--rate-limit=execution_count --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server_conflict.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected success with resource count 1\n***" + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi + +set +e +grep "Resource \"resource2\" is present as both global and device-specific resource in the model configuration." $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed. Expected error message for conflicting resource types\n***" + RET=1 +fi +rm -rf ./custom_models/custom_zero_1_float32_v3 + +set -e + +## +## Tests with cross-model prioritization with various cases: +## +# CASE1: Explicit limited resource: only allows one model to run at a time +SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:4 --rate-limit-resource=resource2:2 --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 $RATE_LIMITER_TEST RateLimiterTest.test_cross_model_prioritization_limited_resource >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# CASE2: Implicit Limited resource: By default, server will select max resources of one of the +# model as available resource. This means only one model will run at a time. +SERVER_ARGS="--rate-limit=execution_count --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 $RATE_LIMITER_TEST RateLimiterTest.test_cross_model_prioritization_limited_resource >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+# CASE3: Explicit plenty resource: Allows multiple models to run simultaneously
+SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:6 --rate-limit-resource=resource2:2 --model-repository=$MODELDIR/custom_models"
+SERVER_LOG="./inference_server.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+
+python3 $RATE_LIMITER_TEST RateLimiterTest.test_cross_model_prioritization_plenty_resource >>$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Test Failed\n***"
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE 1
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+##
+## Tests with multiple instances of the same model
+##
+# Replace the second model with a second instance that has the same resource requirements and priority.
+# TODO: Currently there is no way to check which instance got to run inferences, hence we only
+# check the resource constraint. Add more extensive tests for multiple instances once the required
+# information is made available.
+rm -rf custom_models/custom_zero_1_float32_v2
+(cd custom_models/custom_zero_1_float32 && \
+    echo "instance_group [{" >> config.pbtxt && \
+    echo "kind: KIND_GPU count: 1" >> config.pbtxt && \
+    echo "rate_limiter { resources [{name: \"resource1\" count: 2 }, {name: \"resource2\" global: True count: 2 }] priority: 2}" >> config.pbtxt && \
+    echo "}]" >> config.pbtxt)
+
+# CASE1: limited resource: only allows one model instance to run at a time.
+SERVER_ARGS="--rate-limit=execution_count --model-repository=$MODELDIR/custom_models"
+SERVER_LOG="./inference_server.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+SECONDS=0
+python3 $RATE_LIMITER_TEST RateLimiterTest.test_single_model >>$CLIENT_LOG 2>&1
+TEST_RETCODE=$?
+LIMITED_RESOURCE_TEST_DURATION=$SECONDS
+echo -e "Limited resource time: ${LIMITED_RESOURCE_TEST_DURATION}s"
+if [ $TEST_RETCODE -ne 0 ]; then
+    echo -e "\n***\n*** Test Failed\n***"
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE 1
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+# CASE 2: plenty resource: allows both the instances to run simultaneously
+SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:6 --rate-limit-resource=resource2:2 --model-repository=$MODELDIR/custom_models"
+SERVER_LOG="./inference_server.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+SECONDS=0
+python3 $RATE_LIMITER_TEST RateLimiterTest.test_single_model >>$CLIENT_LOG 2>&1
+TEST_RETCODE=$?
+PLENTY_RESOURCE_TEST_DURATION=$SECONDS
+echo -e "Plenty resource time: ${PLENTY_RESOURCE_TEST_DURATION}s"
+if [ $TEST_RETCODE 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +if [ $PLENTY_RESOURCE_TEST_DURATION -gt $LIMITED_RESOURCE_TEST_DURATION ]; then + echo -e "Error: Test with limited resources should take more time" + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Case 3: resources sufficient only for one model instance. Hence, should fail to load +SERVER_ARGS="--rate-limit=execution_count --rate-limit-resource=resource1:3 --rate-limit-resource=resource2:2 --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server_r3i.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected success with resource count 1\n***" + RET=1 + + kill $SERVER_PID + wait $SERVER_PID +fi +grep "Resource count for \"resource1\" is limited to 3 which will prevent scheduling of one or more model instances, the minimum required count is 4" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error message while loading the model \"custom_zero_1_float32\"\n***" + RET=1 +fi + +## +## Tests with dynamic batching +## +# Despite all the possible bs being preferred triton should always form full batches as +# the second instance would be blocked because of the resource constraints. +(cd custom_models/custom_zero_1_float32 && \ + sed -i "s/.*execute_delay_ms.*/{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}/g" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [ 1, 2, 3, 4 ]" >> config.pbtxt && \ + echo " max_queue_delay_microseconds: 5000000 }" >> config.pbtxt) +export TRITONSERVER_DELAY_SCHEDULER=8 +SERVER_ARGS="--rate-limit=execution_count --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 $RATE_LIMITER_TEST RateLimiterTest.test_single_model_dynamic_batching >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +unset TRITONSERVER_DELAY_SCHEDULER + +## +## Tests with sequence batching +## +# Send one sequence and check for correct accumulator result. The result should be returned immediately. +# This test checks whether all the requests are directed to the same instance despite there being other +# instances with higher priority. 
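+# The config edits below set max_sequence_idle_microseconds to 1s, force max_batch_size
+# to 1, attach 4 units of resource1 to the existing instance group(s), and append two
+# more KIND_CPU instance groups at priority 2 and 3.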
+FIRST_INSTANCE_RESOURCE="rate_limiter { resources [{name: \"resource1\" count: 4 }]}" +(cd custom_models/custom_sequence_int32/ && \ + sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 1000000/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_CPU\\ncount: 1 \n${FIRST_INSTANCE_RESOURCE}/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1 \n${FIRST_INSTANCE_RESOURCE}/" config.pbtxt &&\ + echo "instance_group [{" >> config.pbtxt && \ + echo "kind: KIND_CPU count: 1" >> config.pbtxt && \ + echo "rate_limiter { resources [{name: \"resource1\" count: 2 }, {name: \"resource2\" global: True count: 2 }] priority: 2}" >> config.pbtxt && \ + echo "}]" >> config.pbtxt && \ + echo "instance_group [{" >> config.pbtxt && \ + echo "kind: KIND_CPU count: 2" >> config.pbtxt && \ + echo "rate_limiter { resources [{name: \"resource1\" count: 2 }, {name: \"resource2\" global: True count: 2 }] priority: 3}" >> config.pbtxt && \ + echo "}]" >> config.pbtxt) +SERVER_ARGS="--rate-limit=execution_count --model-repository=$MODELDIR/custom_models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python3 $RATE_LIMITER_TEST RateLimiterTest.test_single_model_sequence_batching >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_register/config.pbtxt b/qa/L0_register/config.pbtxt new file mode 100644 index 0000000000..4ba0081da8 --- /dev/null +++ b/qa/L0_register/config.pbtxt @@ -0,0 +1,49 @@ +# Copyright 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "model" +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_register/test.sh b/qa/L0_register/test.sh new file mode 100755 index 0000000000..6a5a4123ad --- /dev/null +++ b/qa/L0_register/test.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +TEST_LOG="./register_api_test.log" +TEST_EXEC=./register_api_test + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +# Setup repositories for testing, note that we use +# model version as hint for which directory is used for model loading +mkdir empty_models models_0 models_1 +mkdir -p models_0/model_0/1 && \ + cp config.pbtxt models_0/model_0/. && \ + (cd models_0/model_0 && \ + sed -i "s/^name:.*/name: \"model_0\"/" config.pbtxt) +mkdir -p models_1/model_0/2 && \ + cp config.pbtxt models_1/model_0/. 
&& \ + (cd models_1/model_0 && \ + sed -i "s/^name:.*/name: \"model_0\"/" config.pbtxt) +mkdir -p models_1/model_1/3 && \ + cp config.pbtxt models_1/model_1/. && \ + (cd models_1/model_1 && \ + sed -i "s/^name:.*/name: \"model_1\"/" config.pbtxt) + +set +e +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $TEST_EXEC >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Register API Unit Test Failed\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_repoagent_checksum/identity_test.py b/qa/L0_repoagent_checksum/identity_test.py new file mode 100755 index 0000000000..95a0639edf --- /dev/null +++ b/qa/L0_repoagent_checksum/identity_test.py @@ -0,0 +1,113 @@ +#!/usr/bin/python + +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import sys + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import np_to_triton_dtype + +FLAGS = None + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", "--url", type=str, required=False, help="Inference server URL." + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. 
Default is "http".', + ) + + FLAGS = parser.parse_args() + if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) + exit(1) + + client_util = httpclient if FLAGS.protocol == "http" else grpcclient + + if FLAGS.url is None: + FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" + + # Reuse a single client for all sync tests + with client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) as client: + for model_name, np_dtype, shape in ( + # yapf: disable + ("identity_int32", np.int32, [0]), + ("identity_int32", np.int32, [7]) + ): + # yapf: enable + if np_dtype != object: + input_data = (16384 * np.random.randn(*shape)).astype(np_dtype) + else: + in0 = 16384 * np.ones(shape, dtype="int") + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) + input_data = in0n.reshape(in0.shape) + inputs = [ + client_util.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + + results = client.infer(model_name, inputs) + print(results) + + # Make sure outputs are expected value + output_data = results.as_numpy("OUTPUT0") + if output_data is None: + print("error: expected 'OUTPUT0'") + sys.exit(1) + + if np_dtype == object: + output_data = np.char.decode(output_data) + + if not np.array_equal(output_data, input_data): + print( + "error: expected output {} to match input {}".format( + output_data, input_data + ) + ) + sys.exit(1) diff --git a/qa/L0_repoagent_checksum/models/identity_int32/config.pbtxt b/qa/L0_repoagent_checksum/models/identity_int32/config.pbtxt new file mode 100644 index 0000000000..0c6edea6a7 --- /dev/null +++ b/qa/L0_repoagent_checksum/models/identity_int32/config.pbtxt @@ -0,0 +1,67 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "identity_int32" +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +model_repository_agents +{ + agents [ + { + name: "checksum", + parameters [ + { + key: "MD5:1/libtriton_identity.so", + value: "invalid_checksum" + }, + { + key: "MD5:data_file", + value: "4e41030bb1531cd68b2c0277b0aad2e9" + } + ] + } + ] +} \ No newline at end of file diff --git a/qa/L0_repoagent_checksum/models/identity_int32/data_file b/qa/L0_repoagent_checksum/models/identity_int32/data_file new file mode 100644 index 0000000000..e08163d46f --- /dev/null +++ b/qa/L0_repoagent_checksum/models/identity_int32/data_file @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is treated as some other files needed by the model +and thus the repo agent should also verify its checksum. diff --git a/qa/L0_repoagent_checksum/test.sh b/qa/L0_repoagent_checksum/test.sh new file mode 100755 index 0000000000..279cce303e --- /dev/null +++ b/qa/L0_repoagent_checksum/test.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_PY=./identity_test.py +CLIENT_LOG="./client.log" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -fr *.log + +RET=0 + +# The config is set with invalid checksum, so expect server failed to +# load all models +run_server +if [ "$SERVER_PID" == "0" ]; then + set +e + grep "'identity_int32': Mismatched MD5 hash for file 1/libtriton_identity.so" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on mismatched MD5 hash\n***" + cat $SERVER_LOG + RET=1 + fi + set -e +else + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + cat $SERVER_LOG + kill $SERVER_PID + wait $SERVER_PID + exit 1 +fi + +# Set correct md5sum +(cd models/identity_int32 && \ + model_hash=$(md5sum 1/libtriton_identity.so | cut -d' ' -f 1); sed -i "s/invalid_checksum/${model_hash}/" config.pbtxt +) + +# Server should run successfully +rm -fr *.log +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** fail to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +for PROTOCOL in http grpc; do + set +e + python $CLIENT_PY -i $PROTOCOL -v >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + set -e +done + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $SERVER_LOG + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_request_cancellation/grpc_cancellation_test.py b/qa/L0_request_cancellation/grpc_cancellation_test.py new file mode 100755 index 0000000000..4b103e21e1 --- /dev/null +++ b/qa/L0_request_cancellation/grpc_cancellation_test.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import asyncio +import queue +import re +import time +import unittest +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.grpc.aio as grpcclientaio +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class GrpcCancellationTest(unittest.IsolatedAsyncioTestCase): + _model_name = "custom_identity_int32" + _model_delay = 10.0 # seconds + _grpc_params = {"url": "localhost:8001", "verbose": True} + + def setUp(self): + self._client = grpcclient.InferenceServerClient(**self._grpc_params) + self._client_aio = grpcclientaio.InferenceServerClient(**self._grpc_params) + self._user_data = UserData() + self._callback = partial(callback, self._user_data) + self._prepare_request() + self._start_time = time.time() # seconds + self.test_duration_delta = 0.5 + + def tearDown(self): + self._end_time = time.time() # seconds + self._assert_max_duration() + + def _prepare_request(self): + self._inputs = [] + self._inputs.append(grpcclient.InferInput("INPUT0", [1, 1], "INT32")) + self._outputs = [] + self._outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) + self._inputs[0].set_data_from_numpy(np.array([[10]], dtype=np.int32)) + + def _assert_max_duration(self): + max_duration = self._model_delay * self.test_duration_delta # seconds + duration = self._end_time - self._start_time # seconds + self.assertLess( + duration, + max_duration, + f"test runtime expected less than {max_duration}s response time, got {duration}s", + ) + + def _assert_callback_cancelled(self): + self.assertFalse(self._user_data._completed_requests.empty()) + data_item = self._user_data._completed_requests.get() + self.assertIsInstance(data_item, InferenceServerException) + self.assertIn("Locally cancelled by application!", str(data_item)) + + def test_grpc_async_infer(self): + future = self._client.async_infer( + model_name=self._model_name, + inputs=self._inputs, + callback=self._callback, + outputs=self._outputs, + ) + time.sleep(2) # ensure the inference has started + future.cancel() + time.sleep(0.1) # context switch + self._assert_callback_cancelled() + + def test_grpc_stream_infer(self): + self._client.start_stream(callback=self._callback) + self._client.async_stream_infer( + model_name=self._model_name, inputs=self._inputs, outputs=self._outputs + ) + time.sleep(2) # ensure the inference has started + self._client.stop_stream(cancel_requests=True) + self._assert_callback_cancelled() + + async def test_aio_grpc_async_infer(self): + infer_task = 
asyncio.create_task( + self._client_aio.infer( + model_name=self._model_name, inputs=self._inputs, outputs=self._outputs + ) + ) + await asyncio.sleep(2) # ensure the inference has started + infer_task.cancel() + with self.assertRaises(asyncio.CancelledError): + await infer_task + + async def test_aio_grpc_stream_infer(self): + async def requests_generator(): + yield { + "model_name": self._model_name, + "inputs": self._inputs, + "outputs": self._outputs, + } + + responses_iterator = self._client_aio.stream_infer(requests_generator()) + await asyncio.sleep(2) # ensure the inference has started + self.assertTrue(responses_iterator.cancel()) + with self.assertRaises(asyncio.CancelledError): + async for result, error in responses_iterator: + self._callback(result, error) + + def test_grpc_async_infer_cancellation_at_step_start(self): + # This is a longer test + self.test_duration_delta = 4.5 + server_log_name = "grpc_cancellation_test.test_grpc_async_infer_cancellation_at_step_start.server.log" + with open(server_log_name, "r") as f: + server_log = f.read() + + prev_new_req_handl_count = len( + re.findall("New request handler for ModelInferHandler", server_log) + ) + self.assertEqual( + prev_new_req_handl_count, + 2, + "Expected 2 request handler for ModelInferHandler log entries, but got {}".format( + prev_new_req_handl_count + ), + ) + future = self._client.async_infer( + model_name=self._model_name, + inputs=self._inputs, + callback=self._callback, + outputs=self._outputs, + ) + time.sleep(2) # ensure the inference request reached server + future.cancel() + # ensures TRITONSERVER_DELAY_GRPC_PROCESS delay passed on the server + time.sleep(self._model_delay * 2) + + with open(server_log_name, "r") as f: + server_log = f.read() + + cancel_at_start_count = len( + re.findall( + r"Cancellation notification received for ModelInferHandler, rpc_ok=1, context \d+, \d+ step START", + server_log, + ) + ) + cur_new_req_handl_count = len( + re.findall("New request handler for ModelInferHandler", server_log) + ) + self.assertEqual( + cancel_at_start_count, + 2, + "Expected 2 cancellation at step START log entries, but got {}".format( + cancel_at_start_count + ), + ) + self.assertGreater( + cur_new_req_handl_count, + prev_new_req_handl_count, + "gRPC Cancellation on step START Test Failed: New request handler for ModelInferHandler was not created", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_request_cancellation/implicit_state_model/config.pbtxt b/qa/L0_request_cancellation/implicit_state_model/config.pbtxt new file mode 100644 index 0000000000..c5062dcee5 --- /dev/null +++ b/qa/L0_request_cancellation/implicit_state_model/config.pbtxt @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "pytorch" +max_batch_size: 1 + +input { + name: "DELAY_ITRS__0" + data_type: TYPE_INT64 + dims: [ 1 ] +} +output { + name: "DUMMY_OUT__0" + data_type: TYPE_INT64 + dims: [ 1 ] +} + +sequence_batching { + max_sequence_idle_microseconds: 6000000 + oldest { max_candidate_sequences: 1 } + control_input [ + { + name: "SEQ_START__1" + control { + kind: CONTROL_SEQUENCE_START + fp32_false_true: [ 0, 1 ] + } + }, + { + name: "SEQ_ID__2" + control { + kind: CONTROL_SEQUENCE_CORRID + data_type: TYPE_INT64 + } + } + ] + state { + input_name: "SEQ_STATE_IN__3" + output_name: "SEQ_STATE_OUT__1" + data_type: TYPE_INT64 + dims: 1 + initial_state { + name: "initial_state" + data_type: TYPE_INT64 + dims: 1 + zero_data: true + } + } +} + +instance_group { + kind: KIND_CPU + count: 1 +} diff --git a/qa/L0_request_cancellation/implicit_state_model/gen_model.py b/qa/L0_request_cancellation/implicit_state_model/gen_model.py new file mode 100755 index 0000000000..c9b1b5bf29 --- /dev/null +++ b/qa/L0_request_cancellation/implicit_state_model/gen_model.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
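+
+# Generates model.pt (TorchScript) for the implicit-state cancellation test: forward()
+# checks that the implicit sequence state still matches the sequence id and spins for
+# delay_itrs iterations so requests stay in flight long enough to be cancelled.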
+ +import torch + + +class ImplicitStateModel(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, delay_itrs, seq_start, seq_id, seq_state_in): + # if not sequence start, verify sequence state match sequence id + if not seq_start and seq_id != seq_state_in: + print( + f"[MODEL ERROR] Invalid sequence state, expect {seq_id}, got {seq_state_in}" + ) + # delay the execution + delay = 0 + for i in range(int(delay_itrs)): + delay += i + # set sequence state, do not modify state unless sequence starting + if seq_start: + seq_state_out = seq_id + else: + seq_state_out = seq_state_in + dummy_out = seq_state_out + return dummy_out, seq_state_out + + +if __name__ == "__main__": + torch.jit.save(torch.jit.script(ImplicitStateModel()), "model.pt") diff --git a/qa/L0_request_cancellation/implicit_state_model/model.pt b/qa/L0_request_cancellation/implicit_state_model/model.pt new file mode 100644 index 0000000000..73e8e2ac55 Binary files /dev/null and b/qa/L0_request_cancellation/implicit_state_model/model.pt differ diff --git a/qa/L0_request_cancellation/implicit_state_test.py b/qa/L0_request_cancellation/implicit_state_test.py new file mode 100755 index 0000000000..fcc5f7ae88 --- /dev/null +++ b/qa/L0_request_cancellation/implicit_state_test.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient + + +class TestImplicitState(unittest.TestCase): + def _get_inputs(self, delay_itrs): + shape = [1, 1] + inputs = [grpcclient.InferInput("DELAY_ITRS__0", shape, "INT64")] + inputs[0].set_data_from_numpy(np.array([[delay_itrs]], np.int64)) + return inputs + + def _generate_streaming_callback_and_response_pair(self): + response = [] # [{"result": result, "error": error}, ...] 
+ + def callback(result, error): + response.append({"result": result, "error": error}) + + return callback, response + + def _sequence_state_model_infer(self, num_reqs, seq_ids, delay_itrs, cancel_reqs): + model_name = "sequence_state" + callback, response = self._generate_streaming_callback_and_response_pair() + with grpcclient.InferenceServerClient("localhost:8001") as client: + client.start_stream(callback) + seq_start = True + for req_id in range(num_reqs): + for seq_id in seq_ids: + client.async_stream_infer( + model_name, + self._get_inputs(delay_itrs), + sequence_id=seq_id, + sequence_start=seq_start, + ) + time.sleep(0.1) + seq_start = False + client.stop_stream(cancel_requests=cancel_reqs) + return response + + # Test timeout is reset for a sequence slot after its sequence is cancelled + def test_state_reset_after_cancel(self): + sequence_timeout = 6 # secs + # Start sequence 1 and cancel it + num_reqs = 10 + response = self._sequence_state_model_infer( + num_reqs, seq_ids=[1], delay_itrs=5000000, cancel_reqs=True + ) + self.assertLess( + len(response), + num_reqs, + "Precondition not met - sequence completed before cancellation", + ) + # Wait for sequence 1 to timeout + time.sleep(sequence_timeout + 2) + # Start sequence 2 and 3 + self._sequence_state_model_infer( + num_reqs=4, seq_ids=[2, 3], delay_itrs=0, cancel_reqs=False + ) + # Check for any unexpected sequence state mixing + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("[MODEL ERROR] Invalid sequence state", server_log) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_request_cancellation/scheduler_test.py b/qa/L0_request_cancellation/scheduler_test.py new file mode 100755 index 0000000000..900073ea7d --- /dev/null +++ b/qa/L0_request_cancellation/scheduler_test.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
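Before the scheduler tests that follow, here is a minimal sketch of the client-side cancellation pattern they exercise: submit an async request, cancel the returned handle, and check that the callback receives a CANCELLED error. It assumes a slow model named "custom_identity_int32" (as created by test.sh in this directory) served on localhost:8001; the sleeps are only there to give the request and the cancellation time to propagate.

```python
# Sketch of client-side request cancellation, mirroring the tests below.
import time

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

response = {"responded": False, "result": None, "error": None}


def callback(result, error):
    response["responded"] = True
    response["result"] = result
    response["error"] = error


inputs = [grpcclient.InferInput("INPUT0", [1, 8], "INT32")]
inputs[0].set_data_from_numpy(np.ones((1, 8), dtype=np.int32))

client = grpcclient.InferenceServerClient("localhost:8001")
future = client.async_infer("custom_identity_int32", inputs, callback)
time.sleep(2)    # give the request time to reach the server
future.cancel()  # ask Triton to cancel the in-flight request
time.sleep(2)    # wait for the cancellation callback to be delivered

# A cancelled request surfaces as an InferenceServerException with a
# CANCELLED status and no result object.
assert response["responded"]
assert response["result"] is None
assert isinstance(response["error"], InferenceServerException)
assert response["error"].status() == "StatusCode.CANCELLED"
```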
+ +import concurrent.futures +import re +import time +import unittest + +import numpy as np +import requests +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class TestScheduler(unittest.TestCase): + def setUp(self): + # Initialize client + self._triton = grpcclient.InferenceServerClient("localhost:8001") + + def _get_inputs(self, batch_size): + self.assertIsInstance(batch_size, int) + self.assertGreater(batch_size, 0) + shape = [batch_size, 8] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + inputs[0].set_data_from_numpy(np.ones(shape, dtype=np.float32)) + return inputs + + def _generate_callback_and_response_pair(self): + response = {"responded": False, "result": None, "error": None} + + def callback(result, error): + response["responded"] = True + response["result"] = result + response["error"] = error + + return callback, response + + def _assert_response_is_cancelled(self, response): + self.assertTrue(response["responded"]) + self.assertEqual(response["result"], None) + self.assertIsInstance(response["error"], InferenceServerException) + self.assertEqual(response["error"].status(), "StatusCode.CANCELLED") + + def _generate_streaming_callback_and_response_pair(self): + response = [] # [{"result": result, "error": error}, ...] + + def callback(result, error): + response.append({"result": result, "error": error}) + + return callback, response + + def _assert_streaming_response_is_cancelled(self, response): + self.assertGreater(len(response), 0) + cancelled_count = 0 + for res in response: + result, error = res["result"], res["error"] + if error: + self.assertEqual(result, None) + self.assertIsInstance(error, InferenceServerException) + if error.status() == "StatusCode.CANCELLED": + cancelled_count += 1 + self.assertEqual(cancelled_count, 1) + + def _get_metrics(self): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def _metrics_before_test(self, model, reason): + pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)' + metrics = self._get_metrics() + match = re.search(pattern, metrics) + if match: + return int(match.group(1)) + else: + raise Exception(f"Failure metrics for model='{model}' not found") + + def _assert_metrics( + self, model_name, reason, expected_count_increase, initial_count + ): + metrics = self._get_metrics() + # Add initial count + expected count for the the test + expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}' + self.assertIn(expected_metric, metrics) + + # Test queued requests on dynamic batch scheduler can be cancelled + def test_dynamic_batch_scheduler_request_cancellation(self): + model_name = "dynamic_batch" + with concurrent.futures.ThreadPoolExecutor() as pool: + # Saturate the 2 batch slots on the model of 1 instance + saturate_thread_1 = pool.submit( + self._triton.infer, model_name, self._get_inputs(batch_size=1) + ) + saturate_thread_2 = pool.submit( + self._triton.infer, model_name, self._get_inputs(batch_size=1) + ) + time.sleep(2) # ensure the slots are filled + # The next request should be queued + callback, response = self._generate_callback_and_response_pair() + queue_future = self._triton.async_infer( + model_name, self._get_inputs(batch_size=1), callback + ) + time.sleep(2) # ensure the request is queued + self.assertFalse(response["responded"]) + # Cancel the queued 
request + queue_future.cancel() + time.sleep(2) # ensure the cancellation is delivered + self._assert_response_is_cancelled(response) + # Join saturating thread + saturate_thread_1.result() + saturate_thread_2.result() + + # Test backlogged requests on sequence batch scheduler can be cancelled + def test_sequence_batch_scheduler_backlog_request_cancellation(self): + model_name = "sequence_direct" + initial_metrics_value = self._metrics_before_test(model_name, "CANCELED") + with concurrent.futures.ThreadPoolExecutor() as pool: + # Saturate the single sequence slot + saturate_thread = pool.submit( + self._triton.infer, + model_name, + self._get_inputs(batch_size=1), + sequence_id=1, + sequence_start=True, + ) + time.sleep(2) # ensure the slot is filled + # The next sequence with 2 requests should be on the backlog + backlog_requests = [] + for i in range(2): + callback, response = self._generate_callback_and_response_pair() + backlog_future = self._triton.async_infer( + model_name, + self._get_inputs(batch_size=1), + callback, + sequence_id=2, + sequence_start=(True if i == 0 else False), + ) + backlog_requests.append( + {"future": backlog_future, "response": response} + ) + time.sleep(2) # ensure the sequence is backlogged + self.assertFalse(backlog_requests[0]["response"]["responded"]) + self.assertFalse(backlog_requests[1]["response"]["responded"]) + # Cancelling any backlogged request cancels the entire sequence + backlog_requests[0]["future"].cancel() + time.sleep(2) # ensure the cancellation is delivered + time.sleep(2) # ensure reaper thread has responded + self._assert_response_is_cancelled(backlog_requests[0]["response"]) + self._assert_response_is_cancelled(backlog_requests[1]["response"]) + # Join saturating thread + saturate_thread.result() + expected_count_increase = 2 + self._assert_metrics( + model_name, + "CANCELED", + expected_count_increase, + initial_metrics_value, + ) + + # Test queued requests on direct sequence batch scheduler can be cancelled + def test_direct_sequence_batch_scheduler_request_cancellation(self): + model_name = "sequence_direct" + initial_metrics_value = self._metrics_before_test(model_name, "CANCELED") + self._test_sequence_batch_scheduler_queued_request_cancellation(model_name) + expected_count_increase = 2 + self._assert_metrics( + model_name, + "CANCELED", + expected_count_increase, + initial_metrics_value, + ) + + # Test queued requests on oldest sequence batch scheduler can be cancelled + def test_oldest_sequence_batch_scheduler_request_cancellation(self): + model_name = "sequence_oldest" + self._test_sequence_batch_scheduler_queued_request_cancellation(model_name) + + # Helper function + def _test_sequence_batch_scheduler_queued_request_cancellation(self, model_name): + with concurrent.futures.ThreadPoolExecutor() as pool: + # Start the sequence + start_thread = pool.submit( + self._triton.infer, + model_name, + self._get_inputs(batch_size=1), + sequence_id=1, + sequence_start=True, + ) + time.sleep(2) # ensure the sequence has started + # The next 2 requests should be queued + queue_requests = [] + for i in range(2): + callback, response = self._generate_callback_and_response_pair() + queue_future = self._triton.async_infer( + model_name, self._get_inputs(batch_size=1), callback, sequence_id=1 + ) + queue_requests.append({"future": queue_future, "response": response}) + time.sleep(2) # ensure the requests are queued + self.assertFalse(queue_requests[0]["response"]["responded"]) + self.assertFalse(queue_requests[1]["response"]["responded"]) + # 
Cancelling any queued request cancels the entire sequence + queue_requests[0]["future"].cancel() + time.sleep(2) # ensure the cancellation is delivered + time.sleep(2) # ensure reaper thread has responded + self._assert_response_is_cancelled(queue_requests[0]["response"]) + self._assert_response_is_cancelled(queue_requests[1]["response"]) + # Join start thread + start_thread.result() + + # Test ensemble scheduler will propagate cancellation request to child + def test_ensemble_scheduler_request_cancellation(self): + model_name = "ensemble_model" + callback, response = self._generate_callback_and_response_pair() + infer_future = self._triton.async_infer( + model_name, self._get_inputs(batch_size=1), callback + ) + time.sleep(2) # ensure the inference has started + self.assertFalse(response["responded"]) + infer_future.cancel() + time.sleep(2) # ensure the cancellation is delivered + self._assert_response_is_cancelled(response) + + # Test cancellation on multiple gRPC streaming sequences + def test_scheduler_streaming_request_cancellation(self): + model_name = "sequence_oldest" + # Start 2 sequences with many requests + callback, response = self._generate_streaming_callback_and_response_pair() + self._triton.start_stream(callback) + for sequence_id in [1, 2]: + sequence_start = True + for request_id in range(16): + self._triton.async_stream_infer( + model_name, + self._get_inputs(batch_size=1), + sequence_id=sequence_id, + sequence_start=sequence_start, + ) + sequence_start = False + time.sleep(2) # ensure the requests are delivered + # Cancelling the stream cancels all requests on the stream + self._triton.stop_stream(cancel_requests=True) + time.sleep(2) # ensure the cancellation is delivered + time.sleep(2) # ensure reaper thread has responded + self._assert_streaming_response_is_cancelled(response) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_request_cancellation/test.sh b/qa/L0_request_cancellation/test.sh new file mode 100755 index 0000000000..0c9ab74086 --- /dev/null +++ b/qa/L0_request_cancellation/test.sh @@ -0,0 +1,221 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +# +# Unit tests +# +rm -rf models && mkdir models +mkdir -p models/model/1 && (cd models/model && \ + echo 'name: "model"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 64' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ 1000 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ 1000 ] }]' >> config.pbtxt && \ + echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt) + +SERVER_LOG=server.log +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH ./request_cancellation_test > $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Unit Tests Failed\n***" + cat $SERVER_LOG + RET=1 +fi + +# +# gRPC cancellation tests +# +rm -rf models && mkdir models +mkdir -p models/custom_identity_int32/1 && (cd models/custom_identity_int32 && \ + echo 'name: "custom_identity_int32"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1024' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "10000" } }]' >> config.pbtxt) + +for TEST_CASE in "test_grpc_async_infer" "test_grpc_stream_infer" "test_aio_grpc_async_infer" "test_aio_grpc_stream_infer" "test_grpc_async_infer_cancellation_at_step_start"; do + + TEST_LOG="./grpc_cancellation_test.$TEST_CASE.log" + SERVER_LOG="grpc_cancellation_test.$TEST_CASE.server.log" + if [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_at_step_start" ]; then + export TRITONSERVER_DELAY_GRPC_PROCESS=5000 + fi + + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + python grpc_cancellation_test.py GrpcCancellationTest.$TEST_CASE > $TEST_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** gRPC Cancellation Tests Failed on $TEST_CASE\n***" + cat $TEST_LOG + RET=1 + fi + grep "Cancellation notification received for" $SERVER_LOG + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Cancellation not received by server on $TEST_CASE\n***" + cat $SERVER_LOG + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + if [ "$TEST_CASE" == "test_grpc_async_infer_cancellation_at_step_start" ]; then + unset TRITONSERVER_DELAY_GRPC_PROCESS + fi +done + +# +# End-to-end scheduler tests +# +rm -rf models && mkdir models +mkdir -p models/dynamic_batch/1 && (cd models/dynamic_batch && \ + echo 'name: "dynamic_batch"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 2' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'instance_group [{ count: 1 \n kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'dynamic_batching { max_queue_delay_microseconds: 600000 }' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "6000" } }]' >> config.pbtxt) +mkdir -p models/sequence_direct/1 && (cd models/sequence_direct && \ + echo 'name: "sequence_direct"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'instance_group [{ count: 1 \n kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'sequence_batching { direct { } \n max_sequence_idle_microseconds: 6000000 }' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "6000" } }]' >> config.pbtxt) +mkdir -p models/sequence_oldest/1 && (cd models/sequence_oldest && \ + echo 'name: "sequence_oldest"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'instance_group [{ count: 1 \n kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'sequence_batching { oldest { max_candidate_sequences: 1 } \n max_sequence_idle_microseconds: 6000000 }' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "6000" } }]' >> config.pbtxt) +mkdir -p models/ensemble_model/1 && (cd models/ensemble_model && \ + echo 'name: "ensemble_model"' >> config.pbtxt && \ + echo 'platform: "ensemble"' >> config.pbtxt && \ + echo 'max_batch_size: 1' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo 'ensemble_scheduling { step [' >> config.pbtxt && \ + echo -e '{ model_name: "dynamic_batch" \n model_version: -1 \n input_map { key: "INPUT0" \n value: "INPUT0" } \n output_map { key: "OUTPUT0" \n value: "out" } },' >> config.pbtxt && \ + echo -e '{ model_name: "dynamic_batch" \n model_version: -1 \n input_map { key: "INPUT0" \n value: "out" } \n output_map { key: "OUTPUT0" \n value: "OUTPUT0" } }' >> config.pbtxt && \ + echo '] }' >> config.pbtxt) + +TEST_LOG="scheduler_test.log" +SERVER_LOG="./scheduler_test.server.log" + +SERVER_ARGS="--model-repository=`pwd`/models 
--log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python scheduler_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Scheduler Tests Failed\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# +# Implicit state tests +# +rm -rf models && mkdir models +mkdir -p models/sequence_state/1 && (cd models/sequence_state && \ + cp ../../implicit_state_model/config.pbtxt . && \ + cp ../../implicit_state_model/model.pt 1) + +TEST_LOG="implicit_state_test.log" +SERVER_LOG="implicit_state_test.server.log" + +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +SERVER_LOG=$SERVER_LOG python implicit_state_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Implicit State Tests Failed\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_response_cache/ensemble_cache_test.py b/qa/L0_response_cache/ensemble_cache_test.py new file mode 100755 index 0000000000..96b959cc8e --- /dev/null +++ b/qa/L0_response_cache/ensemble_cache_test.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
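The scheduler tests above confirm cancellations through Triton's Prometheus metrics endpoint in addition to the client callbacks. The sketch below shows that metrics check in isolation; the endpoint (localhost:8002), model version ("1"), and the "sequence_direct" model name are assumptions taken from the surrounding test setup.

```python
# Sketch of the metrics check used by scheduler_test.py above: scrape the
# Prometheus endpoint and extract the per-model failure counter for
# cancelled requests.
import re

import requests


def canceled_count(model_name):
    metrics = requests.get("http://localhost:8002/metrics")
    metrics.raise_for_status()
    pattern = (
        rf'nv_inference_request_failure\{{model="{model_name}",'
        rf'reason="CANCELED",version="1"\}} (\d+)'
    )
    match = re.search(pattern, metrics.text)
    if match is None:
        raise RuntimeError(f"no CANCELED failure metric found for '{model_name}'")
    return int(match.group(1))


before = canceled_count("sequence_direct")
# ... cancel some queued sequence requests here ...
after = canceled_count("sequence_direct")
print(f"cancellations recorded by the metric: {after - before}")
```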
+ +import os +import re +import sys + +sys.path.append("../common") +sys.path.append("../clients") +import logging +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import * + +RESPONSE_CACHE_PATTERN = "response_cache" +RESPONSE_CACHE_CONFIG = "response_cache {\n enable:true\n}\n" + + +class EnsembleCacheTest(tu.TestResultCollector): + def setUp(self): + self.triton_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + self.ensemble_model = "simple_graphdef_float32_float32_float32" + self.composing_model = "graphdef_float32_float32_float32" + self.model_directory = os.path.join(os.getcwd(), "models", "ensemble_models") + self.ensemble_config_file = os.path.join( + self.model_directory, self.ensemble_model, "config.pbtxt" + ) + self.composing_config_file = os.path.join( + self.model_directory, self.composing_model, "config.pbtxt" + ) + input0_data = np.ones((1, 16), dtype=np.float32) + input1_data = np.ones((1, 16), dtype=np.float32) + self.input_tensors = [ + grpcclient.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + grpcclient.InferInput( + "INPUT1", input1_data.shape, np_to_triton_dtype(input0_data.dtype) + ), + ] + self.input_tensors[0].set_data_from_numpy(input0_data) + self.input_tensors[1].set_data_from_numpy(input1_data) + + def _update_config(self, config_file, config_pattern, config_to_add): + # Utility function to update config files as per testcase + with open(config_file, "r") as f: + config_data = f.read() + if config_pattern not in config_data: + with open(config_file, "w") as f: + config_data += config_to_add + f.write(config_data) + + def _remove_config(self, config_file, config_to_remove): + # Utility function to remove extra added config from the config files + with open(config_file, "r") as f: + config_data = f.read() + updated_config_data = re.sub(config_to_remove, "", config_data) + with open(config_file, "w") as f: + f.write(updated_config_data) + + def _reset_config_files(self): + # Utility function to reset all config files to original + self._remove_config(self.ensemble_config_file, RESPONSE_CACHE_CONFIG) + self._remove_config(self.composing_config_file, RESPONSE_CACHE_CONFIG) + + def _run_ensemble(self): + # Run the ensemble pipeline and validate output + output = self.triton_client.infer( + model_name=self.ensemble_model, inputs=self.input_tensors + ) + self.assertIsNotNone( + output, + f"Unexpected error: Inference result is None for model '{self.ensemble_model}'. Expected non-null output.", + ) + output0 = output.as_numpy("OUTPUT0") + output1 = output.as_numpy("OUTPUT1") + outputs = [output0, output1] + return outputs + + def _get_model_statistics(self, model): + # Get the stats for the requested model + model_stats = self.triton_client.get_inference_statistics( + model_name=model, as_json=True + ) + + """ + The models used have two versions, version 1 and version 3. + Since, model_version is set to -1 in config.pbtxt, the highest version is loaded + which is version 3. + model_stats has inference stats for version 1 at index 0 and inference stats for version 3 at index 1. 
+ """ + return model_stats["model_stats"][1]["inference_stats"] + + def _run_inference_and_validate(self, model): + """ + Helper function that takes model as a parameter to verify the corresponding model's stats + The passed model is composing model for test case `test_ensemble_composing_model_cache_enabled` + For other testcases, the top-level ensemble model stats are verified. + * loads the simple_graphdef_float32_float32_float32 and graphdef_float32_float32_float32 + and verifies if they are loaded properly. + * Checks the initial statistics of the model passed in the parameter + Expected - baseline statistics to be all empty metrics since + * Calls the run_ensemble function to run the ensemble pipeline. + * Verifies the stats after first inference. Expected single cache miss. + * Calls the run_ensemble function to run the ensemble pipeline again. + * Checks if returned output is equal to th output of first inference. + """ + self.triton_client.load_model(self.ensemble_model) + self.assertTrue( + self.triton_client.is_model_ready(self.ensemble_model), + f"Failed to load ensemble model '{self.ensemble_model}'", + ) + self.triton_client.load_model(self.composing_model) + self.assertTrue( + self.triton_client.is_model_ready(self.composing_model), + f"Failed to load composing model '{self.composing_model}'", + ) + + model_stats_initial = self._get_model_statistics(model) + self.assertNotIn( + "count", + model_stats_initial["success"], + f"No inference stats expected initially for model '{model}'", + ) + + inference_output = self._run_ensemble() + model_stats = self._get_model_statistics(model) + self.assertIn( + "count", model_stats["success"], f"Failed inference for model '{model}'" + ) + self.assertIn( + "count", + model_stats["cache_miss"], + f"No cache miss recorded for model '{model}', expected exactly one cache miss", + ) + self.assertEqual( + model_stats["cache_miss"]["count"], + "1", + f"Expected exactly one cache miss in model '{model}', found {model_stats['cache_miss']['count']}", + ) + + cached_output = self._run_ensemble() + self.assertTrue( + np.array_equal(inference_output, cached_output), + f"Cache response does not match actual inference output for model '{model}'", + ) + + def test_ensemble_top_level_response_cache(self): + """ + Test top level response caching when response cache enabled only in + ensemble model's config file. + Expected result: One cache hit in ensemble model stats. No cache related metric counts in + composing model stats. + """ + self._update_config( + self.ensemble_config_file, RESPONSE_CACHE_PATTERN, RESPONSE_CACHE_CONFIG + ) + self._run_inference_and_validate(self.ensemble_model) + ensemble_model_stats = self._get_model_statistics(self.ensemble_model) + expected_cache_hit_count = "1" + actual_cache_hit_count = ensemble_model_stats["cache_hit"]["count"] + self.assertIn( + "count", + ensemble_model_stats["success"], + f"Failed inference recorded for ensemble model '{self.ensemble_model}'. Expected successful inference.", + ) + self.assertIn( + "count", + ensemble_model_stats["cache_hit"], + f"No cache hit recorded for ensemble model '{self.ensemble_model}'. Expected exactly one cache hit.", + ) + self.assertEqual( + actual_cache_hit_count, + expected_cache_hit_count, + f"Unexpected number of cache hits recorded for ensemble model '{self.ensemble_model}'. Expected exactly one cache hit.", + ) + + def test_ensemble_all_models_cache_enabled(self): + """ + Test top level response caching when response cache enabled in + all the models. 
+ Expected result: One cache hit in ensemble model stats. No cache hit in composing model stats. + """ + self._update_config( + self.ensemble_config_file, RESPONSE_CACHE_PATTERN, RESPONSE_CACHE_CONFIG + ) + self._update_config( + self.composing_config_file, RESPONSE_CACHE_PATTERN, RESPONSE_CACHE_CONFIG + ) + self._run_inference_and_validate(self.ensemble_model) + ensemble_model_stats = self._get_model_statistics(self.ensemble_model) + composing_model_stats = self._get_model_statistics(self.composing_model) + expected_cache_hit_count = "1" + actual_cache_hit_count = ensemble_model_stats["cache_hit"]["count"] + self.assertIn( + "count", + ensemble_model_stats["success"], + f"Failed inference recorded for ensemble model '{self.ensemble_model}'. Expected successful inference.", + ) + self.assertIn( + "count", + ensemble_model_stats["cache_hit"], + f"No cache hit recorded for ensemble model '{self.ensemble_model}'. Expected exactly one cache hit.", + ) + self.assertNotIn( + "count", + composing_model_stats["cache_hit"], + f"Unexpected cache hit recorded for composing model '{self.composing_model}'. Expected top-level response in cache for ensemble model '{self.ensemble_model}'.", + ) + self.assertEqual( + actual_cache_hit_count, + expected_cache_hit_count, + f"Unexpected number of cache hits recorded for ensemble model '{self.ensemble_model}'. Expected exactly one cache hit.", + ) + + def test_ensemble_composing_model_cache_enabled(self): + """ + Test caching behavior when response cache enabled only in + composing model's config file. + Expected result: One cache hit in composing model stats. No cache related metric counts in + ensemble model stats. + """ + self._update_config( + self.composing_config_file, RESPONSE_CACHE_PATTERN, RESPONSE_CACHE_CONFIG + ) + self._run_inference_and_validate(self.composing_model) + ensemble_model_stats = self._get_model_statistics(self.ensemble_model) + composing_model_stats = self._get_model_statistics(self.composing_model) + self.assertIn( + "count", + composing_model_stats["success"], + f"Failed inference recorded for ensemble model '{self.composing_model}'. Expected successful inference.", + ) + self.assertIn( + "count", + composing_model_stats["cache_hit"], + f"No cache hit recorded for ensemble model '{self.composing_model}'. Expected exactly one cache hit.", + ) + self.assertNotIn( + "count", + ensemble_model_stats["cache_hit"], + f"Unexpected number of cache hits recorded for ensemble model '{self.ensemble_model}'. Expected empty cache metrics", + ) + + def test_ensemble_cache_insertion_failure(self): + """ + Test cache insertion failure with cache enabled in + ensemble model's config file. + Expected result: Two cache miss in ensemble model stats indicating request/response not inserted into cache + Reason: The data (input tensors, output tensors and other model information) to be inserted in cache is bigger cache size. + """ + self._update_config( + self.ensemble_config_file, RESPONSE_CACHE_PATTERN, RESPONSE_CACHE_CONFIG + ) + self._run_inference_and_validate(self.ensemble_model) + ensemble_model_stats = self._get_model_statistics(self.ensemble_model) + expected_cache_miss_count = "2" + actual_cache_miss_count = ensemble_model_stats["cache_miss"]["count"] + self.assertIn( + "count", + ensemble_model_stats["success"], + f"Failed inference recorded for ensemble model '{self.ensemble_model}'. 
Expected successful inference.", + ) + self.assertNotIn( + "count", + ensemble_model_stats["cache_hit"], + f"No cache hit recorded for ensemble model '{self.ensemble_model}'. Expected exactly one cache hit.", + ) + self.assertIn( + "count", + ensemble_model_stats["cache_miss"], + f"No cache miss recorded in ensemble model '{self.ensemble_model}'. Expected cache miss.", + ) + self.assertEqual( + actual_cache_miss_count, + expected_cache_miss_count, + f"Unexpected number of cache misses recorded in ensemble model '{self.ensemble_model}'. Expected exactly {expected_cache_miss_count} cache misses for two inference requests, but found {actual_cache_miss_count}.", + ) + + def tearDown(self): + self._reset_config_files() + self.triton_client.close() + + +if __name__ == "__main__": + logging.basicConfig(stream=sys.stderr) + unittest.main() diff --git a/qa/L0_response_cache/models/decoupled_cache/config.pbtxt b/qa/L0_response_cache/models/decoupled_cache/config.pbtxt new file mode 100644 index 0000000000..c243e72861 --- /dev/null +++ b/qa/L0_response_cache/models/decoupled_cache/config.pbtxt @@ -0,0 +1,49 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +model_transaction_policy { + decoupled: True +} +response_cache { + enable: True +} diff --git a/qa/L0_response_cache/models/identity_cache/config.pbtxt b/qa/L0_response_cache/models/identity_cache/config.pbtxt new file mode 100644 index 0000000000..7ba5cf2afb --- /dev/null +++ b/qa/L0_response_cache/models/identity_cache/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "identity" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +response_cache { + enable: True +} diff --git a/qa/L0_response_cache/response_cache_test b/qa/L0_response_cache/response_cache_test new file mode 100755 index 0000000000..d3807fac01 Binary files /dev/null and b/qa/L0_response_cache/response_cache_test differ diff --git a/qa/L0_response_cache/test.sh b/qa/L0_response_cache/test.sh new file mode 100755 index 0000000000..54f35dfeaf --- /dev/null +++ b/qa/L0_response_cache/test.sh @@ -0,0 +1,391 @@ +#!/bin/bash +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +RET=0 + +TEST_LOG="./response_cache_test.log" +UNIT_TEST="./response_cache_test --gtest_output=xml:response_cache.report.xml" +export CUDA_VISIBLE_DEVICES=0 + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "${REPO_VERSION}" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi +# Only localhost supported in this test for now, but in future could make +# use of a persistent remote redis server, or similarly use --replicaof arg. +export TRITON_REDIS_HOST="localhost" +export TRITON_REDIS_PORT="6379" +REDIS_LOG="./redis-server.unit_tests.log" +ENSEMBLE_CACHE_TEST_PY="./ensemble_cache_test.py" +SERVER=/opt/tritonserver/bin/tritonserver +CLIENT_LOG="./client.log" +TEST_RESULT_FILE='test_results.txt' +SERVER_LOG=./inference_server.log +RESET_CONFIG_FUNCTION="_reset_config_files" +CACHE_SIZE=10840 +source ../common/util.sh + +MODEL_DIR="${PWD}/models" +ENSEMBLE_MODEL_DIR="${MODEL_DIR}/ensemble_models" +ENSEMBLE_CACHE_DECOUPLED="${MODEL_DIR}/ensemble_cache_decoupled" +ENSEMBLE_CACHE_COMPOSING_DECOUPLED="${MODEL_DIR}/ensemble_cache_composing_decoupled" +rm -fr ${ENSEMBLE_MODEL_DIR} && mkdir ${ENSEMBLE_MODEL_DIR} +rm -fr ${ENSEMBLE_CACHE_DECOUPLED} && mkdir ${ENSEMBLE_CACHE_DECOUPLED} +rm -fr ${ENSEMBLE_CACHE_COMPOSING_DECOUPLED} && mkdir ${ENSEMBLE_CACHE_COMPOSING_DECOUPLED} +ENSEMBLE_MODEL="simple_graphdef_float32_float32_float32" +COMPOSING_MODEL="graphdef_float32_float32_float32" + +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository/${ENSEMBLE_MODEL}" "${ENSEMBLE_MODEL_DIR}/${ENSEMBLE_MODEL}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_model_repository/${COMPOSING_MODEL}" "${ENSEMBLE_MODEL_DIR}/${COMPOSING_MODEL}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository/${ENSEMBLE_MODEL}" "${ENSEMBLE_CACHE_DECOUPLED}/${ENSEMBLE_MODEL}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_model_repository/${COMPOSING_MODEL}" "${ENSEMBLE_CACHE_DECOUPLED}/${COMPOSING_MODEL}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository/${ENSEMBLE_MODEL}" "${ENSEMBLE_CACHE_COMPOSING_DECOUPLED}/${ENSEMBLE_MODEL}" +cp -r "/data/inferenceserver/${REPO_VERSION}/qa_model_repository/${COMPOSING_MODEL}" "${ENSEMBLE_CACHE_COMPOSING_DECOUPLED}/${COMPOSING_MODEL}" +mkdir -p "${MODEL_DIR}/decoupled_cache/1" +mkdir -p "${MODEL_DIR}/identity_cache/1" + +echo -e "response_cache { enable: True }" >> "${ENSEMBLE_CACHE_DECOUPLED}/${ENSEMBLE_MODEL}/config.pbtxt" +echo -e "model_transaction_policy { decoupled: True }" >> "${ENSEMBLE_CACHE_DECOUPLED}/${ENSEMBLE_MODEL}/config.pbtxt" +echo -e "response_cache { enable: True }" >> "${ENSEMBLE_CACHE_COMPOSING_DECOUPLED}/${ENSEMBLE_MODEL}/config.pbtxt" +echo -e "model_transaction_policy { decoupled: True }" >> 
"${ENSEMBLE_CACHE_COMPOSING_DECOUPLED}/${COMPOSING_MODEL}/config.pbtxt" + +rm -fr *.log + +function install_redis() { + ## Install redis if not already installed + if ! command -v redis-server >/dev/null 2>&1; then + apt update -y && apt install -y redis + fi +} + +function start_redis() { + # Run redis server in background + redis-server \ + --daemonize yes \ + --port "${TRITON_REDIS_PORT}" \ + --logfile "${REDIS_LOG}" \ + --loglevel debug + + # Check redis server is running + REDIS_PING_RESPONSE=$(redis-cli -h ${TRITON_REDIS_HOST} -p ${TRITON_REDIS_PORT} ping) + if [ "${REDIS_PING_RESPONSE}" == "PONG" ]; then + echo "Redis successfully started in background" + else + echo -e "\n***\n*** Failed: Redis server did not start successfully\n***" + RET=1 + fi +} + +function stop_redis() { + echo "Stopping Redis server..." + redis-cli -h "${TRITON_REDIS_HOST}" -p "${TRITON_REDIS_PORT}" shutdown || true + echo "Redis server shutdown" +} + +function set_redis_auth() { + # NOTE: Per-user auth [Access Control List (ACL)] is only supported in + # Redis >= 6.0 and is more comprehensive in what can be configured. + # For simplicity and wider range of Redis version support, use + # server-wide password via "requirepass" for now. + redis-cli -h "${TRITON_REDIS_HOST}" -p "${TRITON_REDIS_PORT}" config set requirepass "${REDIS_PW}" + export REDISCLI_AUTH="${REDIS_PW}" +} + +function unset_redis_auth() { + # Authenticate implicitly via REDISCLI_AUTH env var, then unset password/var + redis-cli -h "${TRITON_REDIS_HOST}" -p "${TRITON_REDIS_PORT}" config set requirepass "" + unset REDISCLI_AUTH +} + +# UNIT TESTS +set +e + +# Unit tests currently run for both Local and Redis cache implementations +# by default. However, we could break out the unit tests for each +# into separate runs gtest filters if needed in the future: +# - `${UNIT_TEST} --gtest_filter=*Local*` +# - `${UNIT_TEST} --gtest_filter=*Redis*` +install_redis +# Stop any existing redis server first for good measure +stop_redis +start_redis +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH $UNIT_TEST >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $TEST_LOG + echo -e "\n***\n*** Response Cache Unit Test Failed\n***" + RET=1 +fi +stop_redis +set -e + +# SERVER TESTS +function check_server_success_and_kill { + if [ "${SERVER_PID}" == "0" ]; then + echo -e "\n***\n*** Failed to start ${SERVER}\n***" + cat ${SERVER_LOG} + RET=1 + else + kill ${SERVER_PID} + wait ${SERVER_PID} + fi +} + +function check_server_expected_failure { + EXPECTED_MESSAGE="${1}" + if [ "${SERVER_PID}" != "0" ]; then + echo -e "\n***\n*** Failed: ${SERVER} started successfully when it was expected to fail\n***" + cat ${SERVER_LOG} + RET=1 + + kill ${SERVER_PID} + wait ${SERVER_PID} + else + # Check that server fails with the correct error message + set +e + grep -i "${EXPECTED_MESSAGE}" ${SERVER_LOG} + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Failed: Expected [${EXPECTED_MESSAGE}] error message in output\n***" + cat $SERVER_LOG + RET=1 + fi + set -e + fi +} + +# DECOUPLED MODEL TESTS +function check_server_failure_decoupled_model { + MODEL_REPOSITORY="${1}" + MODEL="${2}" + EXTRA_ARGS="--model-control-mode=explicit --load-model=${MODEL}" + SERVER_ARGS="--model-repository=${MODEL_REPOSITORY} --cache-config local,size=10480 ${EXTRA_ARGS}" + + rm -f ${SERVER_LOG} + run_server + if [ "${SERVER_PID}" != "0" ]; then + echo -e "\n***\n*** Failed: ${SERVER} started successfully when it was expected to fail\n***" + cat ${SERVER_LOG} + RET=1 + + kill ${SERVER_PID} + wait ${SERVER_PID} + else + # Check that server fails with the correct error message + set +e + grep -i "response cache does not currently support" ${SERVER_LOG} | grep -i "decoupled" + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed: Expected response cache / decoupled mode error message in output\n***" + cat ${SERVER_LOG} + RET=1 + fi + set -e + fi +} + +# ENSEMBLE CACHE TESTS +function test_response_cache_ensemble_model { + TESTCASE="${1}" + ERROR_MESSAGE="${2}" + SERVER_ARGS="--model-repository=${ENSEMBLE_MODEL_DIR} --cache-config local,size=${CACHE_SIZE} --model-control-mode=explicit" + run_server + set +e + python ${ENSEMBLE_CACHE_TEST_PY} ${TESTCASE} >> ${CLIENT_LOG} 2>&1 + if [ $? -ne 0 ]; then + RET=1 + else + check_test_results ${TEST_RESULT_FILE} 1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG} + echo -e ${ERROR_MESSAGE} + RET=1 + fi + fi + + if [ "${TESTCASE}" = "EnsembleCacheTest.test_ensemble_cache_insertion_failure" ]; then + # Check for the error message in the log file + set +e + grep -i "Failed to insert key" "${SERVER_LOG}" + if [ $? -ne 0 ]; then + echo "\n***\n*** Failed: Cache insertion successful when it was expected to fail\n***" + RET=1 + fi + set -e + fi + set -e + check_server_success_and_kill +} + +# Check that server fails to start for a "decoupled" model with cache enabled +check_server_failure_decoupled_model ${MODEL_DIR} "decoupled_cache" + +# Test with model expected to load successfully +EXTRA_ARGS="--model-control-mode=explicit --load-model=identity_cache" + +# Test old cache config method +# --response-cache-byte-size must be non-zero to test models with cache enabled +SERVER_ARGS="--model-repository=${MODEL_DIR} --response-cache-byte-size=8192 ${EXTRA_ARGS}" +run_server +check_server_success_and_kill + +# Test new cache config method +SERVER_ARGS="--model-repository=${MODEL_DIR} --cache-config=local,size=8192 ${EXTRA_ARGS}" +run_server +check_server_success_and_kill + +# Test that specifying multiple cache types is not supported and should fail +SERVER_ARGS="--model-repository=${MODEL_DIR} --cache-config=local,size=8192 --cache-config=redis,key=value ${EXTRA_ARGS}" +run_server +check_server_expected_failure "multiple cache configurations" + +# Test that specifying both config styles is incompatible and should fail +SERVER_ARGS="--model-repository=${MODEL_DIR} --response-cache-byte-size=12345 --cache-config=local,size=67890 ${EXTRA_ARGS}" +run_server +check_server_expected_failure "incompatible flags" + +## Redis Cache CLI tests +REDIS_ENDPOINT="--cache-config redis,host=${TRITON_REDIS_HOST} --cache-config redis,port=${TRITON_REDIS_PORT}" +REDIS_LOG="./redis-server.cli_tests.log" +start_redis + +# Test simple redis cache config succeeds +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${EXTRA_ARGS}" +run_server +check_server_success_and_kill + +# Test triton fails to initialize if it can't 
connect to redis cache +SERVER_ARGS="--model-repository=${MODEL_DIR} --cache-config=redis,host=localhost --cache-config=redis,port=nonexistent ${EXTRA_ARGS}" +run_server +check_server_expected_failure "Failed to connect to Redis: Connection refused" + +# Test triton fails to initialize if it can't resolve host for redis cache +SERVER_ARGS="--model-repository=${MODEL_DIR} --cache-config=redis,host=nonexistent --cache-config=redis,port=nonexistent ${EXTRA_ARGS}" +run_server +# Either of these errors can be returned for bad hostname, so check for either. +MSG1="Temporary failure in name resolution" +MSG2="Name or service not known" +check_server_expected_failure "${MSG1}\|${MSG2}" + +# Test triton fails to initialize if minimum required args (host & port) not all provided +SERVER_ARGS="--model-repository=${MODEL_DIR} --cache-config=redis,port=${TRITON_REDIS_HOST} ${EXTRA_ARGS}" +run_server +check_server_expected_failure "Must at a minimum specify" + +## Redis Authentication tests + +# Automatically provide auth via REDISCLI_AUTH env var when set: https://redis.io/docs/ui/cli/ +REDIS_PW="redis123!" +set_redis_auth + +### Credentials via command-line + +# Test simple redis authentication succeeds with correct credentials +REDIS_CACHE_AUTH="--cache-config redis,password=${REDIS_PW}" +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${REDIS_CACHE_AUTH} ${EXTRA_ARGS}" +run_server +check_server_success_and_kill + +# Test simple redis authentication fails with wrong credentials +REDIS_CACHE_AUTH="--cache-config redis,password=wrong" +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${REDIS_CACHE_AUTH} ${EXTRA_ARGS}" +run_server +check_server_expected_failure "WRONGPASS" + +# Test simple redis authentication fails with no credentials +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${EXTRA_ARGS}" +run_server +check_server_expected_failure "NOAUTH Authentication required" + +### Credentials via environment variables + +# Test simple redis authentication succeeds with password-only via env vars +# No username means use "default" as the username +unset TRITONCACHE_REDIS_USERNAME +export TRITONCACHE_REDIS_PASSWORD="${REDIS_PW}" +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${EXTRA_ARGS}" +run_server +check_server_success_and_kill + +# Test simple redis authentication succeeds with correct user and password via env vars +export TRITONCACHE_REDIS_USERNAME="default" +export TRITONCACHE_REDIS_PASSWORD="${REDIS_PW}" +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${EXTRA_ARGS}" +run_server +check_server_success_and_kill + +# Test simple redis authentication fails with wrong credentials via env vars +export TRITONCACHE_REDIS_PASSWORD="wrong" +SERVER_ARGS="--model-repository=${MODEL_DIR} ${REDIS_ENDPOINT} ${EXTRA_ARGS}" +run_server +check_server_expected_failure "WRONGPASS" +unset TRITONCACHE_REDIS_USERNAME +unset TRITONCACHE_REDIS_PASSWORD +# Clean up redis server +unset_redis_auth +stop_redis + +# Test ensemble model with cache and decoupled mode enabled +check_server_failure_decoupled_model ${ENSEMBLE_CACHE_DECOUPLED} ${ENSEMBLE_MODEL} + +# Test ensemble model with cache enabled and decoupled mode enabled in composing model +check_server_failure_decoupled_model ${ENSEMBLE_CACHE_COMPOSING_DECOUPLED} ${ENSEMBLE_MODEL} + +# Test ensemble model with response cache enabled +TEST_NAME="EnsembleCacheTest.test_ensemble_top_level_response_cache" +ERROR_MESSAGE="\n***\n*** Failed: Expected top level response caching\n***" 
+test_response_cache_ensemble_model "${TEST_NAME}" "${ERROR_MESSAGE}" + +# Test ensemble model with cache enabled in all models +TEST_NAME="EnsembleCacheTest.test_ensemble_all_models_cache_enabled" +ERROR_MESSAGE="\n***\n*** Failed: Expected cache to return top-level request's response\n***" +test_response_cache_ensemble_model "${TEST_NAME}" "${ERROR_MESSAGE}" + +# Test composing model cache enabled +TEST_NAME="EnsembleCacheTest.test_ensemble_composing_model_cache_enabled" +ERROR_MESSAGE="\n***\n*** Failed: Expected only composing model's input/output to be inserted in cache\n***" +test_response_cache_ensemble_model "${TEST_NAME}" "${ERROR_MESSAGE}" + +# Test cache insertion failure +TEST_NAME="EnsembleCacheTest.test_ensemble_cache_insertion_failure" +ERROR_MESSAGE="\n***\n*** Failed: Request added to cache successfully when it was expected to fail\n***" +CACHE_SIZE=200 +test_response_cache_ensemble_model "${TEST_NAME}" "${ERROR_MESSAGE}" + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET + diff --git a/qa/L0_response_statistics/response_statistics_test.py b/qa/L0_response_statistics/response_statistics_test.py new file mode 100755 index 0000000000..64f2d4fb68 --- /dev/null +++ b/qa/L0_response_statistics/response_statistics_test.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
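+# NOTE (informal sketch, not the authoritative protocol schema): the checks in
+# this test assume per-response statistics shaped roughly like
+#
+#   response_stats = {
+#       "0": {"compute_infer": {"count": ..., "ns": ...},
+#             "compute_output": {...}, "success": {...}, "fail": {...},
+#             "empty_response": {...}, "cancel": {...}},
+#       "1": {...},
+#   }
+#
+# where the outer key is the response index and each entry aggregates a
+# "count" and a cumulative duration in nanoseconds ("ns") across requests.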
+ +import time +import unittest + +import numpy as np +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient + + +class TestResponseStatistics(unittest.TestCase): + def setUp(self): + self._model_name = "set_by_test_case" + self._min_infer_delay_ns = 0 + self._min_output_delay_ns = 0 + self._min_cancel_delay_ns = 0 + self._number_of_fail_responses = 0 + self._number_of_empty_responses = 0 + self._statistics_counts = [] + self._grpc_client = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + self._http_client = httpclient.InferenceServerClient("localhost:8000") + + # Return a coupled (callback, response) pair for gRPC stream infer. + def _generate_streaming_callback_and_response_pair(self): + # [{"result": result, "error": error}, ...] + response = [] + + def callback(result, error): + response.append({"result": result, "error": error}) + + return callback, response + + # Send an infer request and return its responses. 'number_of_responses' is the sum + # of success, fail and empty responses the model should return for this request. + # 'cancel_at_response_size' will cancel the stream when the number of responses + # received equals the size, set to None if cancellation is not required. This + # function waits until all success and fail responses are received, or cancelled. + def _stream_infer(self, number_of_responses, cancel_at_response_size=None): + callback, responses = self._generate_streaming_callback_and_response_pair() + self._grpc_client.start_stream(callback) + input_data = np.array([number_of_responses], dtype=np.int32) + inputs = [grpcclient.InferInput("IN", input_data.shape, "INT32")] + inputs[0].set_data_from_numpy(input_data) + outputs = [grpcclient.InferRequestedOutput("OUT")] + self._grpc_client.async_stream_infer( + model_name=self._model_name, inputs=inputs, outputs=outputs + ) + if cancel_at_response_size is None: + # poll until all expected responses are received + while len(responses) < ( + number_of_responses - self._number_of_empty_responses + ): + time.sleep(0.1) + self._grpc_client.stop_stream(cancel_requests=False) + else: + # poll until cancellation response size is reached + while len(responses) < cancel_at_response_size: + time.sleep(0.1) + self._grpc_client.stop_stream(cancel_requests=True) + return responses + + # Update expected statistics counts for the response at 'current_index'. + # 'number_of_responses' is the sum of success, fail and empty responses expected + # from this inference request. 'cancel_at_index' is the index at which the request + # should be cancelled. 
+ def _update_statistics_counts( + self, current_index, number_of_responses, cancel_at_index + ): + if current_index >= len(self._statistics_counts): + self._statistics_counts.append( + { + "compute_infer": 0, + "compute_output": 0, + "success": 0, + "fail": 0, + "empty_response": 0, + "cancel": 0, + } + ) + if current_index == cancel_at_index: + # cancel + self._statistics_counts[current_index]["cancel"] += 1 + elif ( + current_index + + self._number_of_fail_responses + + self._number_of_empty_responses + < number_of_responses + ): + # success + self._statistics_counts[current_index]["compute_infer"] += 1 + self._statistics_counts[current_index]["compute_output"] += 1 + self._statistics_counts[current_index]["success"] += 1 + elif current_index + self._number_of_empty_responses < number_of_responses: + # fail + self._statistics_counts[current_index]["compute_infer"] += 1 + self._statistics_counts[current_index]["compute_output"] += 1 + self._statistics_counts[current_index]["fail"] += 1 + else: + # empty + self._statistics_counts[current_index]["compute_infer"] += 1 + self._statistics_counts[current_index]["empty_response"] += 1 + + # Check the 'response_stats' at 'current_index' for 'stats_name' is valid. + def _check_statistics_count_and_duration( + self, response_stats, current_index, stats_name + ): + expected_count = self._statistics_counts[current_index][stats_name] + if stats_name == "compute_infer" or stats_name == "empty_response": + delay_ns = self._min_infer_delay_ns + elif stats_name == "compute_output": + delay_ns = self._min_output_delay_ns + elif stats_name == "cancel": + delay_ns = self._min_cancel_delay_ns + else: # success or fail + delay_ns = self._min_infer_delay_ns + self._min_output_delay_ns + if delay_ns == 0: + upper_bound_ns = 10000000 * expected_count + lower_bound_ns = 0 + else: + upper_bound_ns = 1.1 * delay_ns * expected_count + lower_bound_ns = 0.9 * delay_ns * expected_count + stats = response_stats[str(current_index)][stats_name] + self.assertEqual(stats["count"], expected_count) + self.assertLessEqual(stats["ns"], upper_bound_ns) + self.assertGreaterEqual(stats["ns"], lower_bound_ns) + + # Fetch and return the response statistics from both gRPC and HTTP endpoints, and + # check they are equivalent before returning. 
+ def _get_response_statistics(self): + # http response statistics + statistics_http = self._http_client.get_inference_statistics( + model_name=self._model_name + ) + model_stats_http = statistics_http["model_stats"][0] + self.assertEqual(model_stats_http["name"], self._model_name) + response_stats_http = model_stats_http["response_stats"] + # grpc response statistics + statistics_grpc = self._grpc_client.get_inference_statistics( + model_name=self._model_name, as_json=True + ) + model_stats_grpc = statistics_grpc["model_stats"][0] + self.assertEqual(model_stats_grpc["name"], self._model_name) + response_stats_grpc = model_stats_grpc["response_stats"] + # check equivalent between http and grpc statistics + self.assertEqual(len(response_stats_http), len(response_stats_grpc)) + for idx, statistics_http in response_stats_http.items(): + self.assertIn(idx, response_stats_grpc) + statistics_grpc = response_stats_grpc[idx] + for name, stats_http in statistics_http.items(): + self.assertIn(name, statistics_grpc) + stats_grpc = statistics_grpc[name] + # normalize gRPC statistics to http + stats_grpc["count"] = ( + int(stats_grpc["count"]) if ("count" in stats_grpc) else 0 + ) + stats_grpc["ns"] = int(stats_grpc["ns"]) if ("ns" in stats_grpc) else 0 + # check equal + self.assertEqual(stats_http, stats_grpc) + return response_stats_http + + # Check the response statistics is valid for a given infer request, providing its + # 'responses', expected 'number_of_responses' and 'cancel_at_index'. + def _check_response_stats( + self, responses, number_of_responses, cancel_at_index=None + ): + response_stats = self._get_response_statistics() + self.assertGreaterEqual(len(response_stats), number_of_responses) + for i in range(number_of_responses): + self._update_statistics_counts(i, number_of_responses, cancel_at_index) + self._check_statistics_count_and_duration( + response_stats, i, "compute_infer" + ) + self._check_statistics_count_and_duration( + response_stats, i, "compute_output" + ) + self._check_statistics_count_and_duration(response_stats, i, "success") + self._check_statistics_count_and_duration(response_stats, i, "fail") + self._check_statistics_count_and_duration( + response_stats, i, "empty_response" + ) + self._check_statistics_count_and_duration(response_stats, i, "cancel") + + # Test response statistics. The statistics must be valid over two or more infers. + def test_response_statistics(self): + self._model_name = "square_int32" + self._min_infer_delay_ns = 400000000 + self._min_output_delay_ns = 200000000 + self._number_of_fail_responses = 2 + self._number_of_empty_responses = 1 + # Send a request that generates 4 responses. + number_of_responses = 4 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + # Send a request that generates 6 responses, and make sure the statistics are + # aggregated with the previous request. + number_of_responses = 6 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + # Send a request that generates 3 responses, and make sure the statistics are + # aggregated with the previous requests. + number_of_responses = 3 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + + # Test response statistics with cancellation. 
+ def test_response_statistics_cancel(self): + self._model_name = "square_int32_slow" + self._min_infer_delay_ns = 1200000000 + self._min_output_delay_ns = 800000000 + self._min_cancel_delay_ns = 400000000 + + # Send a request that generates 4 responses. + number_of_responses = 4 + responses = self._stream_infer(number_of_responses) + self._check_response_stats(responses, number_of_responses) + + # Send a request that generates 4 responses, and cancel on the 3rd response. + # Make sure the statistics are aggregated with the previous request. + responses = self._stream_infer(number_of_responses=4, cancel_at_response_size=1) + # There is an infer and output delay on the 1st and 2nd response, and a cancel + # delay on the 3rd response. + min_total_delay_ns = ( + self._min_infer_delay_ns + self._min_output_delay_ns + ) * 2 + self._min_cancel_delay_ns + # Make sure the inference and cancellation is completed before checking. + time.sleep(min_total_delay_ns * 1.5 / 1000000000) + # The request is cancelled when the 2nd response is computing, so the + # cancellation should be received at the 3rd response (index 2), making a total + # of 3 responses on the statistics. + self._check_response_stats(responses, number_of_responses=3, cancel_at_index=2) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_response_statistics/test.sh b/qa/L0_response_statistics/test.sh new file mode 100755 index 0000000000..b91e3bbde1 --- /dev/null +++ b/qa/L0_response_statistics/test.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +rm -rf models && mkdir models +mkdir -p models/square_int32/1 && (cd models/square_int32 && \ + echo 'backend: "square"' >> config.pbtxt && \ + echo 'max_batch_size: 0' >> config.pbtxt && \ + echo 'model_transaction_policy { decoupled: True }' >> config.pbtxt && \ + echo -e 'input [{ name: "IN" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUT" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_INFER_DELAY_NS" \n value: { string_value: "400000000" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_OUTPUT_DELAY_NS" \n value: { string_value: "200000000" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_FAIL_COUNT" \n value: { string_value: "2" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_EMPTY_COUNT" \n value: { string_value: "1" } }]' >> config.pbtxt) +mkdir -p models/square_int32_slow/1 && (cd models/square_int32_slow && \ + echo 'backend: "square"' >> config.pbtxt && \ + echo 'max_batch_size: 0' >> config.pbtxt && \ + echo 'model_transaction_policy { decoupled: True }' >> config.pbtxt && \ + echo -e 'input [{ name: "IN" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUT" \n data_type: TYPE_INT32 \n dims: [ 1 ] }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_INFER_DELAY_NS" \n value: { string_value: "1200000000" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_OUTPUT_DELAY_NS" \n value: { string_value: "800000000" } }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "CUSTOM_CANCEL_DELAY_NS" \n value: { string_value: "400000000" } }]' >> config.pbtxt) + +TEST_LOG="response_statistics_test.log" +SERVER_LOG="./response_statistics_test.server.log" + +SERVER_ARGS="--model-repository=`pwd`/models" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python response_statistics_test.py > $TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed response statistics test\n***" + cat $TEST_LOG + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_sagemaker/sagemaker_multi_model_test.py b/qa/L0_sagemaker/sagemaker_multi_model_test.py new file mode 100755 index 0000000000..b2052f6751 --- /dev/null +++ b/qa/L0_sagemaker/sagemaker_multi_model_test.py @@ -0,0 +1,379 @@ +#!/usr/bin/python +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json +import os +import sys +import time +import unittest + +import numpy as np +import requests +import test_util as tu +import tritonclient.http as httpclient + + +class SageMakerMultiModelTest(tu.TestResultCollector): + def setUp(self): + SAGEMAKER_BIND_TO_PORT = os.getenv("SAGEMAKER_BIND_TO_PORT", "8080") + self.url_mme_ = "http://localhost:{}/models".format(SAGEMAKER_BIND_TO_PORT) + + # model_1 setup + self.model1_name = "sm_mme_model_1" + self.model1_url = "/opt/ml/models/123456789abcdefghi/model" + + self.model1_input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + self.model1_expected_output0_data_ = [ + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + 22, + 24, + 26, + 28, + 30, + ] + self.model1_expected_output1_data_ = [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ] + + self.model1_expected_result_ = { + "model_name": "sm_mme_model_1", + "model_version": "1", + "outputs": [ + { + "name": "OUTPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.model1_expected_output0_data_, + }, + { + "name": "OUTPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.model1_expected_output1_data_, + }, + ], + } + + # model_2 setup + self.model2_name = "sm_mme_model_2" + self.model2_url = "/opt/ml/models/987654321ihgfedcba/model" + + # Output is same as input since this is an identity model + self.model2_input_data_ = [0, 1, 2, 3, 4, 5, 6, 7] + + # ensemble model setup + self.model3_name = "123456789ensemble" + self.model3_url = "/opt/ml/models/123456789ensemble/model" + + def test_sm_0_environment_variables_set(self): + self.assertEqual( + os.getenv("SAGEMAKER_MULTI_MODEL"), + "true", + "Variable SAGEMAKER_MULTI_MODEL must be set to true", + ) + + def test_sm_1_model_load(self): + # Load model_1 + request_body = {"model_name": self.model1_name, "url": self.model1_url} + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) + time.sleep(5) # wait for model to load + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + # Load the same model again, expect a 409 + request_body = {"model_name": self.model1_name, "url": self.model1_url} + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) + time.sleep(5) # wait for model to load + self.assertEqual( + r.status_code, + 409, + "Expected status 
code 409, received {}".format(r.status_code), + ) + + # Load model_2 + request_body = {"model_name": self.model2_name, "url": self.model2_url} + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) + time.sleep(5) # wait for model to load + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + def test_sm_2_model_list(self): + r = requests.get(self.url_mme_) + time.sleep(3) + expected_response_1 = { + "models": [ + { + "modelName": self.model1_name, + "modelUrl": self.model1_url.rstrip("/model"), + }, + { + "modelName": self.model2_name, + "modelUrl": self.model2_url.rstrip("/model"), + }, + ] + } + expected_response_2 = { + "models": [ + { + "modelName": self.model2_name, + "modelUrl": self.model2_url.rstrip("/model"), + }, + { + "modelName": self.model1_name, + "modelUrl": self.model1_url.rstrip("/model"), + }, + ] + } + + # Returned list response's order is not deterministic + self.assertIn( + r.json(), + [expected_response_1, expected_response_2], + "Expected one of {}, received: {}".format( + [expected_response_1, expected_response_2], r.json() + ), + ) + + def test_sm_3_model_get(self): + get_url = "{}/{}".format(self.url_mme_, self.model1_name) + r = requests.get(get_url) + time.sleep(3) + expected_response = { + "modelName": self.model1_name, + "modelUrl": self.model1_url.rstrip("/model"), + } + self.assertEqual( + r.json(), + expected_response, + "Expected response: {}, received: {}".format(expected_response, r.json()), + ) + + def test_sm_4_model_invoke(self): + # Invoke model_1 + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.model1_input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model1_name) + r = requests.post(invoke_url, data=request_body, headers=headers) + r.raise_for_status() + + self.assertEqual( + self.model1_expected_result_, + r.json(), + "Expected response : {}, received: {}".format( + self.model1_expected_result_, r.json() + ), + ) + + # Invoke model_2 + inputs = [] + outputs = [] + inputs.append( + httpclient.InferInput( + "INPUT0", + [1, 8], + "FP32", + ) + ) + input_data = np.array(self.model2_input_data_, dtype=np.float32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model2_name) + headers = { + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) + } + r = requests.post(invoke_url, data=request_body, headers=headers) + + header_length_prefix = ( + 
"application/vnd.sagemaker-triton.binary+json;json-header-size=" + ) + header_length_str = r.headers["Content-Type"][len(header_length_prefix) :] + result = httpclient.InferenceServerClient.parse_response_body( + r._content, header_length=int(header_length_str) + ) + + # Get the inference header size so we can locate the output binary data + output_data = result.as_numpy("OUTPUT0") + + for i in range(8): + self.assertEqual( + output_data[0][i], input_data[0][i], "Tensor Value Mismatch" + ) + + def test_sm_5_model_unload(self): + # Unload model_1 + unload_url = "{}/{}".format(self.url_mme_, self.model1_name) + r = requests.delete(unload_url) + time.sleep(3) + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + # Unload model_2 + unload_url = "{}/{}".format(self.url_mme_, self.model2_name) + r = requests.delete(unload_url) + time.sleep(3) + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + # Unload a non-loaded model, expect a 404 + unload_url = "{}/sm_non_loaded_model".format(self.url_mme_) + r = requests.delete(unload_url) + time.sleep(3) + self.assertEqual( + r.status_code, + 404, + "Expected status code 404, received {}".format(r.status_code), + ) + + def test_sm_6_ensemble_model(self): + # Load ensemble model + request_body = {"model_name": self.model3_name, "url": self.model3_url} + headers = { + "Content-Type": "application/json", + "X-Amzn-SageMaker-Target-Model": f"{self.model3_name}", + } + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) + time.sleep(5) # wait for model to load + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + # Invoke ensemble model + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32")) + + # Initialize the data + input_data = np.array(self.model1_input_data_, dtype=np.float32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model3_name) + r = requests.post(invoke_url, data=request_body, headers=headers) + print(f"response: {r.text}") + r.raise_for_status() + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + # Unload ensemble model + unload_url = "{}/{}".format(self.url_mme_, self.model3_name) + r = requests.delete(unload_url, headers=headers) + time.sleep(5) + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_sagemaker/sagemaker_test.py b/qa/L0_sagemaker/sagemaker_test.py new file mode 100755 index 0000000000..6e76a9f0fd --- /dev/null +++ b/qa/L0_sagemaker/sagemaker_test.py @@ -0,0 +1,387 @@ +#!/usr/bin/python +# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import json +import os +import sys +import unittest + +import numpy as np +import requests +import test_util as tu +import tritonclient.http as httpclient + + +class SageMakerTest(tu.TestResultCollector): + def setUp(self): + SAGEMAKER_BIND_TO_PORT = os.getenv("SAGEMAKER_BIND_TO_PORT", "8080") + self.url_ = "http://localhost:{}/invocations".format(SAGEMAKER_BIND_TO_PORT) + self.input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + self.expected_output0_data_ = [ + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + 22, + 24, + 26, + 28, + 30, + ] + self.expected_output1_data_ = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + self.expected_result_ = { + "model_name": "sm_model", + "model_version": "1", + "outputs": [ + { + "name": "OUTPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.expected_output0_data_, + }, + { + "name": "OUTPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.expected_output1_data_, + }, + ], + } + + def test_direct_inference(self): + request = { + "inputs": [ + { + "name": "INPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.input_data_, + }, + { + "name": "INPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.input_data_, + }, + ] + } + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_, data=json.dumps(request), headers=headers) + r.raise_for_status() + + self.assertEqual( + self.expected_result_, + r.json(), + "Expected response body: {}; got: {}".format( + self.expected_result_, r.json() + ), + ) + + def test_inference_client_generated_request(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, 
binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + self.assertEqual( + self.expected_result_, + r.json(), + "Expected response body: {}; got: {}".format( + self.expected_result_, r.json() + ), + ) + + def test_inference_client_generated_request_binary(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) + } + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + self.assertEqual( + self.expected_result_, + r.json(), + "Expected response body: {}; got: {}".format( + self.expected_result_, r.json() + ), + ) + + def test_inference_client_generated_response(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + result = httpclient.InferenceServerClient.parse_response_body(r._content) + + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") + for i in range(16): + self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) + self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) + + def test_inference_client_generated_response_binary(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + 
outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + header_length_prefix = ( + "application/vnd.sagemaker-triton.binary+json;json-header-size=" + ) + header_length_str = r.headers["Content-Type"][len(header_length_prefix) :] + result = httpclient.InferenceServerClient.parse_response_body( + r._content, header_length=int(header_length_str) + ) + + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") + for i in range(16): + self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) + self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) + + def test_malformed_binary_header(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "additional-string/application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + def test_malformed_binary_header_not_number(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=additional-string{}".format( + header_length + ) + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + def test_malformed_binary_header_negative_number(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + 
inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=-123" + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + def test_malformed_binary_header_large_number(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=12345" + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_sagemaker/test.sh b/qa/L0_sagemaker/test.sh new file mode 100755 index 0000000000..b5bd07c519 --- /dev/null +++ b/qa/L0_sagemaker/test.sh @@ -0,0 +1,465 @@ +#!/bin/bash +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
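+# Overview (informal): this script exercises Triton through the SageMaker
+# entrypoints using the 'serve' script. Single-model mode is checked via
+# /ping and /invocations, port selection via SAGEMAKER_BIND_TO_PORT, and
+# SAGEMAKER_SAFE_PORT_RANGE is used to verify that endpoints outside the
+# allowed range are rejected. Multi-model (MME) mode loads and unloads
+# models through the /models endpoint; an illustrative request (shape taken
+# from sagemaker_multi_model_test.py, values are examples only) is:
+#
+#   curl -X POST localhost:8080/models \
+#     -H 'Content-Type: application/json' \
+#     -d '{"model_name": "sm_mme_model_1", "url": "/opt/ml/models/123456789abcdefghi/model"}'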
+ +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +# Make sure we can safety use symbolic link for SageMaker serve script +if [ -d "/opt/ml/model" ] || [ -L "/opt/ml/model" ]; then + echo -e "Default SageMaker model path must not be used for testing" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +rm -rf models +rm -f *.log +rm -f *.out + +SAGEMAKER_TEST=sagemaker_test.py +SAGEMAKER_MULTI_MODEL_TEST=sagemaker_multi_model_test.py +MULTI_MODEL_UNIT_TEST_COUNT=7 +UNIT_TEST_COUNT=9 +CLIENT_LOG="./client.log" + +DATADIR=/data/inferenceserver/${REPO_VERSION} +ENSEMBLEDIR=/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./server.log" +# Link model repository to "/opt/ml/model" +mkdir /opt/ml/ +ln -s `pwd`/models /opt/ml/model +source ../common/util.sh + +mkdir models && \ + cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 models/sm_model && \ + rm -r models/sm_model/2 && rm -r models/sm_model/3 && \ + sed -i "s/onnx_int32_int32_int32/sm_model/" models/sm_model/config.pbtxt + +# Use SageMaker's ping endpoint to check server status +# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on +# success, 1 on failure +function sagemaker_wait_for_server_ready() { + local spid="$1"; shift + local wait_time_secs="${1:-30}"; shift + + WAIT_RET=0 + + ping_address="localhost:8080/ping" + if [ -n "$SAGEMAKER_BIND_TO_PORT" ]; then + ping_address="localhost:${SAGEMAKER_BIND_TO_PORT}/ping" + fi + + local wait_secs=$wait_time_secs + until test $wait_secs -eq 0 ; do + if ! kill -0 $spid; then + echo "=== Server not running." + WAIT_RET=1 + return + fi + + sleep 1; + + set +e + code=`curl -s -w %{http_code} $ping_address` + set -e + if [ "$code" == "200" ]; then + return + fi + + ((wait_secs--)); + done + + echo "=== Timeout $wait_time_secs secs. Server not ready." + WAIT_RET=1 +} + +# Start server with 'serve' script +export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME=sm_model +serve > $SERVER_LOG 2>&1 & +SERVE_PID=$! +# Obtain Triton PID in such way as $! will return the script PID +sleep 1 +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` +sagemaker_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + exit 1 +fi + +# Ping +set +e +code=`curl -s -w %{http_code} -o ./ping.out localhost:8080/ping` +set -e +if [ "$code" != "200" ]; then + cat ./ping.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +# Inference in default setting +set +e +python $SAGEMAKER_TEST SageMakerTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVE_PID + +# Change SageMaker port +export SAGEMAKER_BIND_TO_PORT=8000 +serve > $SERVER_LOG 2>&1 & +SERVE_PID=$! +# Obtain Triton PID in such way as $! 
will return the script PID +sleep 1 +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` +sagemaker_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + exit 1 +fi + +# Inference with the new port +set +e +python $SAGEMAKER_TEST SageMakerTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +unset SAGEMAKER_BIND_TO_PORT + +kill $SERVER_PID +wait $SERVE_PID + +# Set SageMaker safe port range +export SAGEMAKER_SAFE_PORT_RANGE="8081-9000" + +# Start Triton in a similar way to 'serve' script, as 'serve' script can't +# be used to satisfy the setting under test +SAGEMAKER_ARGS="--model-repository=/opt/ml/model" +if [ -n "$SAGEMAKER_BIND_TO_PORT" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-port=${SAGEMAKER_BIND_TO_PORT}" +fi +if [ -n "$SAGEMAKER_SAFE_PORT_RANGE" ]; then + SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-safe-port-range=${SAGEMAKER_SAFE_PORT_RANGE}" +fi + +# Enable HTTP endpoint and expect server fail to start (default port 8000 < 8081) +SERVER_ARGS="--allow-sagemaker=true --allow-grpc false --allow-http true --allow-metrics false \ + --model-control-mode=explicit --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME} \ + $SAGEMAKER_ARGS" +run_server_nowait +sagemaker_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" == "0" ]; then + echo -e "\n***\n*** Expect failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + RET=1 +else + grep "The server cannot listen to HTTP requests at port" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on using disallowed port\n***" + RET=1 + fi +fi + +# Run 'serve' script and expect SageMaker endpoint on default port 8080 (< 8081) +# is working +serve > $SERVER_LOG 2>&1 & +SERVE_PID=$! +# Obtain Triton PID in such way as $! will return the script PID +sleep 1 +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` + +sagemaker_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + exit 1 +fi + +# Inference with the new port +set +e +python $SAGEMAKER_TEST SageMakerTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +unset SAGEMAKER_SAFE_PORT_RANGE +unset SAGEMAKER_TRITON_DEFAULT_MODEL_NAME + +kill $SERVER_PID +wait $SERVE_PID + +# Test serve with incorrect model name +export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME=incorrect_model_name +serve > $SERVER_LOG 2>&1 & +SERVE_PID=$! +# Obtain Triton PID in such way as $! will return the script PID +sleep 1 +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` +if [ -n "$SERVER_PID" ]; then + echo -e "\n***\n*** Expect failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + RET=1 +else + grep "ERROR: Directory with provided SAGEMAKER_TRITON_DEFAULT_MODEL_NAME ${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME} does not exist" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected error on model name and dir name mismatch\n***" + RET=1 + fi +fi + +unset SAGEMAKER_TRITON_DEFAULT_MODEL_NAME + +# Test serve with SAGEMAKER_TRITON_DEFAULT_MODEL_NAME unset, but containing single model directory +serve > $SERVER_LOG 2>&1 & +SERVE_PID=$! +# Obtain Triton PID in such way as $! will return the script PID +sleep 1 +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` +sagemaker_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + exit 1 +else + grep "WARNING: No SAGEMAKER_TRITON_DEFAULT_MODEL_NAME provided" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected server to start with only existing directory as model.\n***" + RET=1 + fi +fi + +kill $SERVER_PID +wait $SERVE_PID + +# Test unspecified SAGEMAKER_TRITON_DEFAULT_MODEL_NAME for ecs/eks case +SERVER_ARGS="--allow-sagemaker=true --allow-grpc false --allow-http false --allow-metrics false \ + --model-repository `pwd`/models --model-control-mode=explicit --exit-on-error=false" +run_server_nowait +sleep 5 +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +code=`curl -X POST -s -w %{http_code} -o ./invoke.out localhost:8080/invocations --data-raw 'dummy'` +set -e +if [ "$code" == "200" ]; then + cat ./invoke.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "Request for unknown model: 'unspecified_SAGEMAKER_TRITON_DEFAULT_MODEL_NAME' is not found" ./invoke.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected inference to fail with unspecified model error.\n***" + fi +fi + +kill $SERVER_PID +wait $SERVER_PID + +# TODO: Test ensemble backend + +# Run server with invalid model and exit-on-error=false +rm models/sm_model/1/* +SERVER_ARGS="--allow-sagemaker=true --allow-grpc false --allow-http false --allow-metrics false \ + --model-repository `pwd`/models --model-control-mode=explicit --load-model=sm_model \ + --exit-on-error=false" +run_server_nowait +sleep 5 +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Ping and expect error code in SME mode. +set +e +code=`curl -s -w %{http_code} -o ./ping.out localhost:8080/ping` +set -e +if [ "$code" == "200" ]; then + cat ./ping.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# MME begin +# Prepare model repository + +ln -s `pwd`/models /opt/ml/models +# Model path will be of the form /opt/ml/models//model +MODEL1_PATH="models/123456789abcdefghi/model" +MODEL2_PATH="models/987654321ihgfedcba/model" +mkdir -p "${MODEL1_PATH}" +mkdir -p "${MODEL2_PATH}" + +cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32/* ${MODEL1_PATH} && \ + rm -r ${MODEL1_PATH}/2 && rm -r ${MODEL1_PATH}/3 && \ + sed -i "s/onnx_int32_int32_int32/sm_mme_model_1/" ${MODEL1_PATH}/config.pbtxt + +cp -r $DATADIR/qa_identity_model_repository/onnx_zero_1_float32/* ${MODEL2_PATH} && \ + sed -i "s/onnx_zero_1_float32/sm_mme_model_2/" ${MODEL2_PATH}/config.pbtxt + +# Ensemble model +ENSEMBLE_MODEL_PATH="models/123456789ensemble/model" +mkdir -p "${ENSEMBLE_MODEL_PATH}" + +model_name=python_float32_float32_float32 + +mkdir -p ${ENSEMBLE_MODEL_PATH}/${model_name}/1 && \ +cp ../python_models/add_sub/model.py ${ENSEMBLE_MODEL_PATH}/${model_name}/1/. && \ +cp ../python_models/add_sub/config.pbtxt ${ENSEMBLE_MODEL_PATH}/${model_name}/. 
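+# The subshell below adjusts the copied add_sub config for use as an ensemble
+# composing model: it drops the label_filename entry and appends
+# "max_batch_size: 64" so the model supports batching.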
+(cd ${ENSEMBLE_MODEL_PATH}/${model_name} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + echo "max_batch_size: 64" >> config.pbtxt) + +# Ensemble part +mkdir -p ${ENSEMBLE_MODEL_PATH}/fan_${model_name}/1 && \ + cp ../python_models/add_sub/model.py ${ENSEMBLE_MODEL_PATH}/fan_${model_name}/1/. && \ + cp ../python_models/fan_add_sub/config.pbtxt ${ENSEMBLE_MODEL_PATH}/fan_${model_name}/. && \ + (cd ${ENSEMBLE_MODEL_PATH}/fan_${model_name} && \ + sed -i "s/label_filename:.*//" config.pbtxt && \ + sed -i "s/model_name: \"ENSEMBLE_MODEL_NAME\"/model_name: \"${model_name}\"/" config.pbtxt && \ + sed -i "0,/name:.*/{s/name:.*/name: \"fan_${model_name}\"/}" config.pbtxt && \ + echo "max_batch_size: 64" >> config.pbtxt) + +# # custom float32 component of ensemble +cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 ${ENSEMBLE_MODEL_PATH}/. && \ + mkdir -p ${ENSEMBLE_MODEL_PATH}/nop_TYPE_FP32_-1/1 + +# Start server with 'serve' script +export SAGEMAKER_MULTI_MODEL=true +export SAGEMAKER_TRITON_LOG_VERBOSE=true + +serve > $SERVER_LOG 2>&1 & +SERVE_PID=$! +# Obtain Triton PID in such way as $! will return the script PID +sleep 1 +SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` +sagemaker_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + exit 1 +fi + +# API tests in default setting +set +e +python $SAGEMAKER_MULTI_MODEL_TEST SageMakerMultiModelTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $MULTI_MODEL_UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +unset SAGEMAKER_MULTI_MODEL + +unlink /opt/ml/models +rm -rf /opt/ml/models + +kill $SERVER_PID +wait $SERVE_PID +# MME end + +unlink /opt/ml/model +rm -rf /opt/ml/model + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_savedmodel_shape/saved_model_shape_test.py b/qa/L0_savedmodel_shape/saved_model_shape_test.py new file mode 100755 index 0000000000..b5ae13a680 --- /dev/null +++ b/qa/L0_savedmodel_shape/saved_model_shape_test.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + +np_dtype_string = np.dtype(object) + + +class SavedModelShapeTest(tu.TestResultCollector): + def _full_exact( + self, input_dtype, output0_dtype, output1_dtype, output0_raw, output1_raw, swap + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): + for bs in (1, batch_size): + # model that does not support batching + if bs == 1: + iu.infer_exact( + tester, + "savedmodel_nobatch", + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) + # model that supports batching + iu.infer_exact( + tester, + "savedmodel", + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) + + input_size = 16 + + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "savedmodel", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + def test_raw_bbb(self): + self._full_exact( + np.int8, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=True + ) + + def test_raw_sss(self): + self._full_exact( + np.int16, np.int16, np.int16, output0_raw=True, output1_raw=True, swap=True + ) + + def test_raw_iii(self): + self._full_exact( + np.int32, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=True + ) + + def test_raw_lll(self): + self._full_exact( + np.int64, np.int64, np.int64, output0_raw=True, output1_raw=True, swap=False + ) + + def test_raw_hhh(self): + self._full_exact( + np.float16, + np.float16, + np.float16, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_fff(self): + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) + + def test_raw_hff(self): + self._full_exact( + np.float16, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_bii(self): + self._full_exact( + np.int8, np.int32, np.int32, output0_raw=True, output1_raw=True, 
swap=False + ) + + def test_raw_ibb(self): + self._full_exact( + np.int32, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=False + ) + + def test_raw_ibs(self): + self._full_exact( + np.int32, np.int8, np.int16, output0_raw=True, output1_raw=True, swap=False + ) + + def test_raw_iff(self): + self._full_exact( + np.int32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_fii(self): + self._full_exact( + np.float32, + np.int32, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + def test_raw_ihs(self): + self._full_exact( + np.int32, + np.float16, + np.int16, + output0_raw=True, + output1_raw=True, + swap=False, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_savedmodel_shape/test.sh b/qa/L0_savedmodel_shape/test.sh new file mode 100755 index 0000000000..e059a5bf0b --- /dev/null +++ b/qa/L0_savedmodel_shape/test.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +CLIENT_LOG_BASE="./client_saved_model_shape" +INFER_TEST=saved_model_shape_test.py +EXPECTED_NUM_TESTS="13" + +DATADIR=`pwd`/models + +SERVER=/opt/tritonserver/bin/tritonserver +# Allow more time to exit. 
Ensemble brings in too many models +SERVER_ARGS="--model-repository=$DATADIR --exit-timeout-secs=120" +SERVER_LOG_BASE="./server_saved_model_shape" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* + +RET=0 + +SERVER_LOG=$SERVER_LOG_BASE.${TARGET}.log +CLIENT_LOG=$CLIENT_LOG_BASE.${TARGET}.log + +rm -fr models && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_noshape_model_repository models + +create_nop_version_dir `pwd`/models + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# python unittest seems to swallow ImportError and still return 0 +# exit code. So need to explicitly check CLIENT_LOG to make sure +# we see some running tests +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_scalar_io/scalar_test.py b/qa/L0_scalar_io/scalar_test.py new file mode 100755 index 0000000000..16aa1136ca --- /dev/null +++ b/qa/L0_scalar_io/scalar_test.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
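+# Checks that models declaring scalar (size-1) inputs and outputs accept
+# single-element tensors of shape [1] and [[1]] and echo them back unchanged.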
+ +import sys + +sys.path.append("../common") + +import os +import unittest + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import np_to_triton_dtype + + +class ScalarIOTest(tu.TestResultCollector): + def setUp(self): + self._client = grpcclient.InferenceServerClient(url="localhost:8001") + self._backends = os.environ.get("BACKENDS", "onnx").split(",") + + def _send_request_and_verify_result(self, input, model_name): + inputs = [] + inputs.append( + grpcclient.InferInput("INPUT", input.shape, np_to_triton_dtype(input.dtype)) + ) + inputs[-1].set_data_from_numpy(input) + result = self._client.infer(inputs=inputs, model_name=model_name) + output = result.as_numpy("OUTPUT") + np.testing.assert_allclose(input, output) + + def test_scalar_io(self): + for backend in self._backends: + model_name = f"{backend}_scalar_1dim" + self._send_request_and_verify_result( + np.asarray([1], dtype=np.float32), model_name + ) + + model_name = f"{backend}_scalar_2dim" + self._send_request_and_verify_result( + np.asarray([[1]], dtype=np.float32), model_name + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_scalar_io/test.sh b/qa/L0_scalar_io/test.sh new file mode 100755 index 0000000000..ebb9a48d95 --- /dev/null +++ b/qa/L0_scalar_io/test.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +RET=0 +TEST_RESULT_FILE='test_results.txt' +BACKENDS="onnx" +export CUDA_VISIBLE_DEVICES=0 +DATADIR=/data/inferenceserver/${REPO_VERSION} + +rm -rf models +mkdir models +cp -r $DATADIR/qa_scalar_models/* models/ + +CLIENT_LOG="./client.log" +SCALAR_TEST=scalar_test.py +source ../common/util.sh + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +python3 $SCALAR_TEST >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** scalar_test.py FAILED. \n***" + cat $CLIENT_LOG + cat $SERVER_LOG + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Make sure the server fails loading the model if it has a dimension higher than +# 1 +sed -i "s/dims.*/dims:\[2\]/g" models/onnx_scalar_1dim/config.pbtxt +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Expected the server to fail loading \n***" + cat $SERVER_LOG + exit 1 +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_sdk/grpc_test.cc b/qa/L0_sdk/grpc_test.cc new file mode 100644 index 0000000000..3f45e4ae25 --- /dev/null +++ b/qa/L0_sdk/grpc_test.cc @@ -0,0 +1,57 @@ +// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+#include <iostream>
+
+#include "grpc_client.h"
+
+namespace tc = triton::client;
+
+int
+main(int argc, char* argv[])
+{
+  std::unique_ptr<tc::InferenceServerGrpcClient> client;
+  // Add a symbol from protobufs to verify correct linking
+  inference::ModelConfigResponse model_config;
+  tc::Error err =
+      tc::InferenceServerGrpcClient::Create(&client, "localhost:8001");
+  if (!err.IsOk()) {
+    std::cerr << "InferenceServerGrpcClient::Create failed: " << err.Message()
+              << std::endl;
+    return 1;
+  }
+
+  // No server is running so expect liveness call to fail
+  bool live;
+  err = client->IsServerLive(&live);
+  if (!err.IsOk()) {
+    std::cerr << "InferenceServerGrpcClient::IsServerLive expected fail: "
+              << err.Message() << std::endl;
+    return 0;
+  }
+
+  return 1;
+}
diff --git a/qa/L0_sdk/http_test.cc b/qa/L0_sdk/http_test.cc
new file mode 100644
index 0000000000..0b2a4da597
--- /dev/null
+++ b/qa/L0_sdk/http_test.cc
@@ -0,0 +1,55 @@
+// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <iostream>
+
+#include "http_client.h"
+
+namespace tc = triton::client;
+
+int
+main(int argc, char* argv[])
+{
+  std::unique_ptr<tc::InferenceServerHttpClient> client;
+  tc::Error err =
+      tc::InferenceServerHttpClient::Create(&client, "localhost:8000");
+  if (!err.IsOk()) {
+    std::cerr << "InferenceServerHttpClient::Create failed: " << err.Message()
+              << std::endl;
+    return 1;
+  }
+
+  // No server is running so expect liveness call to fail
+  bool live;
+  err = client->IsServerLive(&live);
+  if (!err.IsOk()) {
+    std::cerr << "InferenceServerHttpClient::IsServerLive expected fail: "
+              << err.Message() << std::endl;
+    return 0;
+  }
+
+  return 1;
+}
diff --git a/qa/L0_sdk/test.sh b/qa/L0_sdk/test.sh
new file mode 100755
index 0000000000..20baf31639
--- /dev/null
+++ b/qa/L0_sdk/test.sh
@@ -0,0 +1,217 @@
+#!/bin/bash
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Install the tar file +rm -fr triton_client +mkdir triton_client +(cd triton_client && tar xzvf /workspace/*.tar.gz) + +set +e + +RET=0 + +# Check image_client and perf_client +if [[ ! -x "triton_client/bin/image_client" ]]; then + echo -e "*** image_client executable not present\n" + RET=1 +fi +if [[ ! -x "triton_client/bin/perf_analyzer" ]]; then + echo -e "*** perf_analyzer executable is not present\n" + RET=1 +fi +if [[ ! -x "triton_client/bin/perf_client" ]]; then + echo -e "*** perf_client link is not present\n" + RET=1 +fi + +# Check static libraries +for l in libgrpcclient.so libgrpcclient_static.a libhttpclient.so libhttpclient_static.a; do + if [[ ! -f "triton_client/lib/$l" ]]; then + echo -e "*** library $l not present\n" + RET=1 + fi +done + +client_lib=$(pwd)/triton_client/lib +client_inc=$(pwd)/triton_client/include + +# Test linking against the shared library +g++ grpc_test.cc -o grpc_test -I$client_inc -L$client_lib -lgrpcclient + +if [ $? -eq 0 ]; then + if [[ ! -x "./grpc_test" ]]; then + echo -e "*** grpc_test executable not present\n" + RET=1 + else + ./grpc_test + if [ $? -eq 0 ]; then + echo -e "\n***\n*** grpc_test exited with 0 PASSED\n***" + else + echo -e "\n***\n*** grpc_test exited with non-zero FAILED\n***" + RET=1 + fi + fi +else + echo -e "\n***\n*** Client headers build FAILED\n***" + RET=1 +fi + +# +# Test linking against static library +# + +grpc_static_libs="-Wl,--start-group $client_lib/*.a -Wl,--end-group" + +g++ grpc_test.cc $grpc_static_libs -o grpc_test_static -I$client_inc -lz -lssl -lcrypto -lpthread + +if [ $? -eq 0 ]; then + if [[ ! -x "./grpc_test_static" ]]; then + echo -e "*** grpc_test_static executable not present\n" + RET=1 + else + ./grpc_test_static + if [ $? 
-eq 0 ]; then + echo -e "\n***\n*** grpc_test_static exited with 0 PASSED\n***" + else + echo -e "\n***\n*** grpc_test_static exited with non-zero FAILED\n***" + RET=1 + fi + fi +else + echo -e "\n***\n*** Client headers build FAILED\n***" + RET=1 +fi + +# +# Test a simple app using Triton HTTP API +# + +# Test linking against the shared library +g++ http_test.cc -o http_test -I$client_inc -L$client_lib -lhttpclient + +if [ $? -eq 0 ]; then + if [[ ! -x "./http_test" ]]; then + echo -e "*** http_test executable not present\n" + RET=1 + else + ./http_test + if [ $? -eq 0 ]; then + echo -e "\n***\n*** http_test exited with 0 PASSED\n***" + else + echo -e "\n***\n*** http_test exited with non-zero FAILED\n***" + RET=1 + fi + fi +else + echo -e "\n***\n*** Client headers build FAILED\n***" + RET=1 +fi + +g++ http_test.cc $client_lib/libhttpclient_static.a $client_lib/libcurl.a -o http_test_static \ + -I$client_inc -lz -lssl -lcrypto -lpthread + +if [ $? -eq 0 ]; then + if [[ ! -x "./http_test_static" ]]; then + echo -e "*** http_test_static executable not present\n" + RET=1 + else + ./http_test_static + if [ $? -eq 0 ]; then + echo -e "\n***\n*** http_test_static exited with 0 PASSED\n***" + else + echo -e "\n***\n*** http_test_static exited with non-zero FAILED\n***" + RET=1 + fi + fi +else + echo -e "\n***\n*** Client headers build FAILED\n***" + RET=1 +fi + +# Check wheels, note that even TRITON_VERSION is passed as version field for +# wheel generation. The version number will be normalized by setuptools, so +# we need to replace the text here as well to match the normalized version. +WHLVERSION=`cat /workspace/TRITON_VERSION | sed 's/dev/\.dev0/'` +if [[ "aarch64" != $(uname -m) ]] ; then + WHLS="tritonclient-${WHLVERSION}-py3-none-any.whl \ + tritonclient-${WHLVERSION}-py3-none-manylinux1_x86_64.whl" +else + WHLS="tritonclient-${WHLVERSION}-py3-none-any.whl \ + tritonclient-${WHLVERSION}-py3-none-manylinux2014_aarch64.whl" +fi +for l in $WHLS; do + if [[ ! -f "triton_client/python/$l" ]]; then + echo -e "*** wheel $l not present\n" + echo -e "*** available wheels in triton_client/python\n" + ls -ltr triton_client/python + RET=1 + fi +done + +# Check wheel installation +python -c """import tritonclient; import tritonclient.grpc; import tritonclient.http; \ + import tritonclient.utils; import tritonclient.grpc.model_config_pb2; \ + import tritonclient.grpc.service_pb2; import tritonclient.grpc.service_pb2_grpc; \ + import tritonclient.utils.cuda_shared_memory; import tritonclient.utils.shared_memory""" +RET=$(($RET+$?)) + +EXECUTABLES="perf_analyzer perf_client" +for l in $EXECUTABLES; do + if [ $(which -a $l | grep "/usr/local/bin/$l" | wc -l) -ne 1 ]; then + which -a $l + echo -e "*** $l executable not installed by tritonclient wheel\n" + RET=1 + fi +done + +# Check java client +if [[ ! -e "triton_client/java/java-api-0.0.1.jar" ]]; then + echo -e "*** java-api-0.0.1.jar not present\n" + RET=1 +fi +if [[ ! -e "triton_client/java/examples/MemoryGrowthTest.jar" ]]; then + echo -e "*** MemoryGrowthTest.jar not present\n" + RET=1 +fi +if [[ ! -e "triton_client/java/examples/SimpleInferClient.jar" ]]; then + echo -e "*** SimpleInferClient.jar not present\n" + RET=1 +fi +if [[ ! 
-e "triton_client/java/examples/SimpleInferPerf.jar" ]]; then + echo -e "*** SimpleInferPerf.jar not present\n" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_secure_grpc/test.sh b/qa/L0_secure_grpc/test.sh new file mode 100755 index 0000000000..784613c6a2 --- /dev/null +++ b/qa/L0_secure_grpc/test.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +TEST_CLIENT_AIO_PY=../clients/simple_grpc_aio_infer_client.py +TEST_CLIENT_PY=../clients/simple_grpc_infer_client.py +TEST_CLIENT=../clients/simple_grpc_infer_client + +CLIENT_LOG=`pwd`/client.log +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_BASE_ARGS="--model-repository=$DATADIR --grpc-use-ssl=1 --grpc-server-cert server.crt --grpc-server-key server.key --grpc-root-cert ca.crt" +source ../common/util.sh + +rm -fr *.log *.log.* + +# Generate valid CA +openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" + +# Generate valid Server Key/Cert +openssl genrsa -passout pass:1234 -des3 -out server.key 4096 +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt + +# Remove passphrase from the Server Key +openssl rsa -passin pass:1234 -in server.key -out server.key + +# Generate valid Client Key/Cert +openssl genrsa -passout pass:1234 -des3 -out client.key 4096 +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt + +# Remove passphrase from Client Key +openssl rsa -passin pass:1234 -in client.key -out client.key + +# Create mutated client key (Make first char of each like capital) +cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key +cp client.crt client2.crt && sed -i "s/\b\(.\)/\u\1/g" client2.crt + +# Test all 3 SSL/TLS cases, server authentication, mutual authentication and when both flags are specified +for CASE in server mutual both; do + if [ "$CASE" == "server" ]; then + SERVER_ARGS="$SERVER_BASE_ARGS --grpc-use-ssl=1" + elif [ "$CASE" == "mutual" ]; then + SERVER_ARGS="$SERVER_BASE_ARGS --grpc-use-ssl-mutual=1" + else + SERVER_ARGS="$SERVER_BASE_ARGS --grpc-use-ssl=1 --grpc-use-ssl-mutual=1" + fi + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + # Test basic inference using grpc secure channel + $TEST_CLIENT_PY -v --ssl --root-certificates ca.crt --private-key client.key --certificate-chain client.crt >> ${CLIENT_LOG}.${CASE}.ssl_infer 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.ssl_infer + RET=1 + fi + $TEST_CLIENT_AIO_PY -v --ssl --root-certificates ca.crt --private-key client.key --certificate-chain client.crt >> ${CLIENT_LOG}.${CASE}.ssl_infer.aio 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.ssl_infer.aio + RET=1 + fi + + $TEST_CLIENT -v --ssl --root-certificates ca.crt --private-key client.key --certificate-chain client.crt >> ${CLIENT_LOG}.${CASE}.c++.ssl_infer 2>&1 + if [ $? 
-ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.c++.ssl_infer + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test failure cases for SSL +for CASE in server mutual; do + if [ "$CASE" == "server" ]; then + SERVER_ARGS="$SERVER_BASE_ARGS --grpc-use-ssl=1" + elif [ "$CASE" == "mutual" ]; then + SERVER_ARGS="$SERVER_BASE_ARGS --grpc-use-ssl-mutual=1" + fi + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + # Test inference client using grpc secure channel without ssl + $TEST_CLIENT_PY -v >> ${CLIENT_LOG}.${CASE}.no_ssl_fail_infer 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.no_ssl_fail_infer + echo -e "\n***\n*** Expected test failure\n***" + else + RET=1 + fi + $TEST_CLIENT_AIO_PY -v >> ${CLIENT_LOG}.${CASE}.no_ssl_fail_infer.aio 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.no_ssl_fail_infer.aio + echo -e "\n***\n*** Expected test failure\n***" + else + RET=1 + fi + + $TEST_CLIENT -v >> ${CLIENT_LOG}.${CASE}.c++.no_ssl_fail_infer 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.c++.no_ssl_fail_infer + echo -e "\n***\n*** Expected test failure\n***" + else + RET=1 + fi + + # Test inference client using grpc secure channel with incorrect ssl creds + $TEST_CLIENT_PY -v --ssl --root-certificates ca.crt --private-key client2.key --certificate-chain client2.crt >> ${CLIENT_LOG}.${CASE}.wrong_ssl_fail_infer 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.wrong_ssl_fail_infer + echo -e "\n***\n*** Expected test failure\n***" + else + RET=1 + fi + $TEST_CLIENT_AIO_PY -v --ssl --root-certificates ca.crt --private-key client2.key --certificate-chain client2.crt >> ${CLIENT_LOG}.${CASE}.wrong_ssl_fail_infer.aio 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.wrong_ssl_fail_infer.aio + echo -e "\n***\n*** Expected test failure\n***" + else + RET=1 + fi + + $TEST_CLIENT -v --ssl --root-certificates ca.crt --private-key client2.key --certificate-chain client2.crt >> ${CLIENT_LOG}.${CASE}.c++.wrong_ssl_fail_infer 2>&1 + if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}.${CASE}.c++.wrong_ssl_fail_infer + echo -e "\n***\n*** Expected test failure\n***" + else + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_sequence_batcher/request_timeout_models/custom_sequence_int32_timeout/config.pbtxt b/qa/L0_sequence_batcher/request_timeout_models/custom_sequence_int32_timeout/config.pbtxt new file mode 100644 index 0000000000..d9be228d5d --- /dev/null +++ b/qa/L0_sequence_batcher/request_timeout_models/custom_sequence_int32_timeout/config.pbtxt @@ -0,0 +1,62 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "identity" +max_batch_size: 1 + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + +sequence_batching { + max_sequence_idle_microseconds: 50000000 +} + +parameters [ + { + key: "execute_delay_ms" + value: { string_value: "5000" } + } +] diff --git a/qa/L0_sequence_batcher/sequence_batcher_test.py b/qa/L0_sequence_batcher/sequence_batcher_test.py new file mode 100755 index 0000000000..3e6cfc032a --- /dev/null +++ b/qa/L0_sequence_batcher/sequence_batcher_test.py @@ -0,0 +1,3618 @@ +#!/usr/bin/env python + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
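
For orientation before the test body: the tests in this file drive Triton's sequence batcher through helpers imported from sequence_util.py, and each request in a sequence is an ordinary inference call that additionally carries a correlation ID plus START/END flags. The following minimal sketch illustrates that call pattern with the Python gRPC client; the endpoint, model name, and tensor name are illustrative placeholders rather than values taken from this test suite.

    import numpy as np
    import tritonclient.grpc as grpcclient
    from tritonclient.utils import np_to_triton_dtype

    # Placeholder endpoint and model; a sequence-batched model must be loaded.
    client = grpcclient.InferenceServerClient(url="localhost:8001")

    # One element per request; the sequence batcher requires batch-size 1.
    value = np.full((1, 1), 1, dtype=np.int32)
    inputs = [
        grpcclient.InferInput("INPUT", value.shape, np_to_triton_dtype(value.dtype))
    ]
    inputs[0].set_data_from_numpy(value)

    # Every request of a sequence carries the same correlation ID; START and
    # END flags mark the sequence boundaries.
    client.infer(
        "my_sequence_model", inputs,
        sequence_id=1001, sequence_start=True, sequence_end=False)
    client.infer(
        "my_sequence_model", inputs,
        sequence_id=1001, sequence_start=False, sequence_end=True)

Several of the negative tests below exercise exactly the error paths hit when the correlation ID is zero or the START flag is omitted.
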
+ +import sys + +sys.path.append("../common") + +import os +import random +import threading +import time +import unittest +from builtins import str +from functools import partial + +import numpy as np +import sequence_util as su +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) + +USE_GRPC = os.environ.get("USE_GRPC", 1) != "0" +USE_HTTP = os.environ.get("USE_HTTP", 1) != "0" +assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero" +if USE_GRPC and USE_HTTP: + _protocols = ("http", "grpc") +elif USE_GRPC: + _protocols = ("grpc",) +else: + _protocols = ("http",) + +BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx plan custom python") +ENSEMBLES = bool(int(os.environ.get("ENSEMBLES", 1))) + +NO_BATCHING = int(os.environ["NO_BATCHING"]) == 1 +MODEL_INSTANCES = int(os.environ["MODEL_INSTANCES"]) +IMPLICIT_STATE = int(os.environ["IMPLICIT_STATE"]) == 1 + +# Use initial state for implicit state +INITIAL_STATE_FILE = int(os.environ["INITIAL_STATE_FILE"]) == 1 + +_trials = () +if NO_BATCHING: + for backend in BACKENDS.split(" "): + if backend != "custom": + _trials += (backend + "_nobatch",) +elif os.environ["BATCHER_TYPE"] == "VARIABLE": + for backend in BACKENDS.split(" "): + if (backend != "libtorch") and (backend != "custom"): + _trials += (backend,) +else: + _trials = BACKENDS.split(" ") + +# Add ensemble to the _trials +ENSEMBLE_PREFIXES = ["simple_", "sequence_", "fan_"] + +if ENSEMBLES: + res = [] + for trial in _trials: + res.append(trial) + if "custom" in trial: + continue + for ensemble_prefix in ENSEMBLE_PREFIXES: + res.append(ensemble_prefix + trial) + _trials = tuple(res) + +_ragged_batch_supported_trials = list() +if "custom" in _trials: + _ragged_batch_supported_trials = ("custom",) + +# Not all models can be tested for ragged handling because the models +# don't deal well with non-size-1 shapes +_ragged_batch_not_supported_trials = list() +if os.environ["BATCHER_TYPE"] == "VARIABLE": + if "custom" in _trials: + _ragged_batch_not_supported_trials.append("custom") + if "plan" in _trials: + _ragged_batch_not_supported_trials.append("plan") + if "onnx" in _trials: + _ragged_batch_not_supported_trials.append("onnx") + +_max_sequence_idle_ms = 5000 + + +# Checks whether the provided model name belongs to an ensemble +# model. +def is_ensemble(model_name): + for prefix in ENSEMBLE_PREFIXES: + if model_name.startswith(prefix): + return True + return False + + +class SequenceBatcherTest(su.SequenceBatcherTestUtil): + def get_datatype(self, trial): + # Get the datatype to use based on what models are available (see test.sh) + if "plan" in trial: + return (np.float32,) + if "custom" in trial: + return (np.int32,) + if "savedmodel" in trial: + return (np.float32, np.bool_) + if "graphdef" in trial: + return (np.dtype(object), np.bool_) + + # Only test the string data type for ONNX and libtorch models in implicit state + if IMPLICIT_STATE: + if "onnx" in trial: + return (np.dtype(object), np.int32, np.bool_) + if NO_BATCHING: + if "libtorch" in trial: + return (np.dtype(object), np.int32, np.bool_) + + return (np.int32, np.bool_) + + def get_expected_result(self, expected_result, value, trial, flag_str=None): + # Adjust the expected_result for models that + # could not implement the full accumulator. 
See + # qa/common/gen_qa_sequence_models.py for more + # information. + if ( + (not NO_BATCHING and ("custom" not in trial)) + or ("graphdef" in trial) + or ("plan" in trial) + or ("onnx" in trial) + ) or ("libtorch" in trial): + expected_result = value + if (flag_str is not None) and ("start" in flag_str): + expected_result += 1 + return expected_result + + def get_expected_result_implicit( + self, expected_result, value, trial, flag_str=None, dtype=None + ): + if dtype == np.dtype(object) and trial.startswith("onnx"): + return value + + if INITIAL_STATE_FILE: + # When the INITIAL_STATE_FILE is set the initial value + # used for sequence will be 100 instead of zero and the + # results will be offset by the same amount. + return expected_result + 100 + else: + return expected_result + + def test_simple_sequence(self): + # Send one sequence and check for correct accumulator + # result. The result should be returned immediately. + for trial in _trials: + # Run on different protocols. + for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + expected_result = ( + self.get_expected_result(45, 9, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 45, 9, trial, "end", dtype + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + 5, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + ( + ("start", 1, None, None), + (None, 2, None, None), + (None, 3, None, None), + (None, 4, None, None), + (None, 5, None, None), + (None, 6, None, None), + (None, 7, None, None), + (None, 8, None, None), + ("end", 9, None, None), + ), + expected_result, + protocol, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_deferred_exception() + self.check_status( + model_name, {1: 9 * (idx + 1)}, 9 * (idx + 1), 9 * (idx + 1) + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_length1_sequence(self): + # Send a length-1 sequence and check for correct accumulator + # result. The result should be returned immediately. + for trial in _trials: + # Run on different protocols. 
+ for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + expected_result = ( + self.get_expected_result(42, 42, trial, "start,end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 42, 42, trial, "start,end", dtype + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + 99, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + (("start,end", 42, None, None),), + expected_result, + protocol, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_deferred_exception() + self.check_status( + model_name, {1: idx + 1}, (idx + 1), (idx + 1) + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_batch_size(self): + # Send sequence with a batch-size > 1 and check for error. + + # When 4 model instances the max-batch-size is 1 so can't test + # since that gives a different error: "batch-size 2 exceeds + # maximum batch size" + if (MODEL_INSTANCES == 4) or NO_BATCHING: + return + + for trial in _trials: + # Run on different protocols. + for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + expected_result = ( + self.get_expected_result(10, 9, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 10, 9, trial, "end", dtype + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + 27, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + (("start", 1, None, None), ("end", 9, None, None)), + expected_result, + protocol, + batch_size=2, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_deferred_exception() + self.assertTrue(False, "expected error") + except Exception as ex: + for prefix in ENSEMBLE_PREFIXES: + if model_name.startswith(prefix): + base_model_name = model_name[(len(prefix)) :] + self.assertTrue( + ex.message().startswith( + str( + "in ensemble '{}', " + + "inference request to model '{}' must specify " + + "batch-size 1 due to requirements of sequence " + + "batcher" + ).format(model_name, base_model_name) + ) + ) + return + self.assertTrue( + ex.message().startswith( + str( + "inference request to model '{}' must specify " + + "batch-size 1 due to requirements of sequence " + + "batcher" + ).format(model_name) + ) + ) + + def test_no_correlation_id(self): + # Send sequence without correlation ID and check for error. 
+ for trial in _trials: + # Run on different protocols. + for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + expected_result = ( + self.get_expected_result(10, 9, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 10, 9, trial, "end", dtype + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + 0, # correlation_id = 0 + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + (("start", 1, None, None), ("end", 9, None, None)), + expected_result, + protocol, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_deferred_exception() + self.assertTrue(False, "expected error") + except Exception as ex: + for prefix in ENSEMBLE_PREFIXES: + if model_name.startswith(prefix): + base_model_name = model_name[(len(prefix)) :] + self.assertTrue( + ex.message().startswith( + str( + "in ensemble '{}', " + + "inference request to model '{}' must specify a " + + "non-zero or non-empty correlation ID" + ).format(model_name, base_model_name) + ) + ) + return + self.assertTrue( + ex.message().startswith( + str( + "inference request to model '{}' must specify a " + + "non-zero or non-empty correlation ID" + ).format(model_name) + ) + ) + + def test_no_sequence_start(self): + # Send sequence without start flag for never before seen + # correlation ID. Expect failure. + for trial in _trials: + # Run on different protocols. 
+ for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + + expected_result = ( + self.get_expected_result(6, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + self.check_sequence( + trial, + model_name, + dtype, + 37469245, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + ( + (None, 1, None, None), + (None, 2, None, None), + ("end", 3, None, None), + ), + expected_result, + protocol, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_deferred_exception() + self.assertTrue(False, "expected error") + except Exception as ex: + print(model_name + "-> " + ex.message()) + for prefix in ENSEMBLE_PREFIXES: + if model_name.startswith(prefix): + base_model_name = model_name[(len(prefix)) :] + self.assertTrue( + ex.message().startswith( + str( + "in ensemble '{}', " + + "inference request for sequence 37469245 to " + + "model '{}' must specify the START flag on the first " + + "request of the sequence" + ).format(model_name, base_model_name) + ) + ) + return + self.assertTrue( + ex.message().startswith( + str( + "inference request for sequence 37469245 to " + + "model '{}' must specify the START flag on the first " + + "request of the sequence" + ).format(model_name) + ) + ) + + def test_no_sequence_start2(self): + # Send sequence without start flag after sending a valid + # sequence with the same correlation ID. Expect failure for + # the second sequence. + for trial in _trials: + # Run on different protocols. 
+ for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + expected_result = ( + self.get_expected_result(6, 3, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, None, dtype + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + 3, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + ( + ("start", 1, None, None), + (None, 2, None, None), + ("end", 3, None, None), + (None, 55, None, None), + ), + expected_result, + protocol, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_status( + model_name, {1: 3 * (idx + 1)}, 3 * (idx + 1), 3 * (idx + 1) + ) + self.check_deferred_exception() + self.assertTrue(False, "expected error") + except Exception as ex: + for prefix in ENSEMBLE_PREFIXES: + if model_name.startswith(prefix): + base_model_name = model_name[(len(prefix)) :] + self.assertTrue( + ex.message().startswith( + str( + "in ensemble '{}', " + + "inference request for sequence 3 to model '{}' must " + + "specify the START flag on the first request of " + + "the sequence" + ).format(model_name, base_model_name) + ) + ) + return + self.assertTrue( + ex.message().startswith( + str( + "inference request for sequence 3 to model '{}' must " + + "specify the START flag on the first request of " + + "the sequence" + ).format(model_name) + ) + ) + + def test_no_sequence_end(self): + # Send sequence without end flag. Use same correlation ID to + # send another sequence. The first sequence will be ended + # automatically but the second should complete successfully. + for trial in _trials: + # Run on different protocols. 
+ for idx, protocol in enumerate(_protocols): + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + try: + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn( + "TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ + ) + expected_result = ( + self.get_expected_result(51, 9, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 51, 9, trial, "end", dtype + ) + ) + + self.check_sequence( + trial, + model_name, + dtype, + 4566, + (4000, None), + # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) + ( + ("start", 1, None, None), + (None, 2, None, None), + ("start", 42, None, None), + ("end", 9, None, None), + ), + expected_result, + protocol, + sequence_name="{}_{}".format( + self._testMethodName, protocol + ), + ) + + self.check_deferred_exception() + self.check_status( + model_name, {1: 4 * (idx + 1)}, 4 * (idx + 1), 4 * (idx + 1) + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_half_batch(self): + # Test model instances that together are configured with + # total-batch-size 4. Send two equal-length sequences in + # parallel and make sure they get completely batched into + # batch-size 2 inferences. + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3, 4), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (0, 9, 5, 13), dtype, 1 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
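+                    # (Each of the two sequences in this test contributes
+                    # four requests: 4 + 4 = 8.)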
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 8) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + expected_result = ( + self.get_expected_result(10, 4, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 10, 4, trial, "end", dtype + ) + ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 987, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1, None), + (None, 2, None), + (None, 3, None), + ("end", 4, None), + ), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(27, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 27, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 988, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 0, None), + (None, 9, None), + (None, 5, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 8}, 8, 8) + else: + stats_batch_size = 2 if MODEL_INSTANCES == 1 else 1 + exec_cnt = 4 if MODEL_INSTANCES == 1 else 8 + self.check_status( + model_name, + {stats_batch_size: 4 * min(2, MODEL_INSTANCES)}, + exec_cnt, + 8, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + + def test_skip_batch(self): + # Test model instances together are configured with + # total-batch-size 4. Send four sequences in parallel where + # two sequences have shorter length so that padding must be + # applied correctly for the longer sequences. + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13, 14), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113, 1114), dtype, 3 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
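+                    # (The four sequences in this test contribute
+                    # 2 + 4 + 2 + 4 = 12 requests.)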
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(4, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(50, 14, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 50, 14, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + (None, 13, None), + ("end", 14, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(224, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 224, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 111, None), ("end", 113, None)), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(4450, 1114, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4450, 1114, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + (None, 1113, None), + ("end", 1114, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[1].start() + threads[3].start() + time.sleep(3) + threads[0].start() + threads[2].start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 12}, 12, 12) + else: + # Batch size is 4 for the first two inferences and + # then 2 for the second two inferences. This is + # because we request the longer sequences first + # (threads 1 and 3) in slots 0 and 1 and so after + # shorter sequences are complete there are only slots + # 0 and 1 to execute. 
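+                        # For example, with MODEL_INSTANCES == 1 the first two
+                        # steps run at batch size 4 (2 executions, 8 requests)
+                        # and the remaining two steps of the long sequences run
+                        # at batch size 2 (2 executions, 4 requests), giving
+                        # {2: 2, 4: 2} with 4 executions over 12 requests.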
+ if MODEL_INSTANCES == 1: + self.check_status(model_name, {2: 2, 4: 2}, 4, 12) + elif MODEL_INSTANCES == 2: + self.check_status(model_name, {2: 4, 1: 4}, 8, 12) + elif MODEL_INSTANCES == 4: + self.check_status(model_name, {1: 12}, 12, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_full_batch(self): + # Test model instances together are configured with + # total-batch-size 4. Send four equal-length sequences in + # parallel and make sure they get completely batched into + # batch-size 4 inferences. + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. + self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + expected_result = ( + self.get_expected_result(6, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result(36, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result(336, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + 
), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(3336, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 12}, 12, 12) + else: + self.check_status( + model_name, + {(4 / MODEL_INSTANCES): (3 * MODEL_INSTANCES)}, + 3 * MODEL_INSTANCES, + 12, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_ragged_batch(self): + # Test model instances that together are configured with + # total-batch-size 4. The sequences use the different size + # inputs and the inputs are *not* marked as allowing ragged + # batch. Send four equal-length sequences in parallel and + # make sure they don't get batched. + + # Only works with 1 model instance since want to test all + # sequences batching together. + if MODEL_INSTANCES != 1: + return + + for trial in _ragged_batch_not_supported_trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0, tensor_shape=(2,) + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1, tensor_shape=(2,) + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2, tensor_shape=(1,) + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3, tensor_shape=(3,) + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
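+                    # Four 3-request sequences are sent below (12 requests in
+                    # total), using tensor shapes (2,), (2,), (1,) and (3,) so
+                    # that differently shaped sequences cannot share a batch.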
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(6 * 2, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (2,), + }, + ) + ) + + expected_result = ( + self.get_expected_result(36 * 2, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (2,), + }, + ) + ) + expected_result = ( + self.get_expected_result(336, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (1,), + }, + ) + ) + expected_result = ( + self.get_expected_result(3336 * 3, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (3,), + }, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + time.sleep(3) + threads[3].start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 12}, 12, 12) + else: + self.check_status(model_name, {4: 9}, 9, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_ragged_batch_allowed(self): + # Test model instances that together are configured with + # total-batch-size 4. The sequences use the different size + # inputs. 
Send four equal-length sequences in parallel and + # make sure they get batched appropriately even with size + # differences. + + # Only works with 1 model instance since want to test all + # sequences batching together. + if MODEL_INSTANCES != 1: + return + + for trial in _ragged_batch_supported_trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0, tensor_shape=(2,) + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1, tensor_shape=(2,) + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2, tensor_shape=(1,) + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3, tensor_shape=(3,) + ) + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. + self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + + expected_result = ( + self.get_expected_result(6 * 2, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6 * 2, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (2,), + }, + ) + ) + + expected_result = ( + self.get_expected_result(36 * 2, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36 * 2, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (2,), + }, + ) + ) + expected_result = ( + self.get_expected_result(336, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (1,), + }, + ) + ) + expected_result = ( + self.get_expected_result(3336 * 3, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336 * 3, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + 
target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "tensor_shape": (3,), + }, + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 12}, 12, 12) + else: + self.check_status(model_name, {4: 3}, 3, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_backlog(self): + # Test model instances together are configured with + # total-max-batch-size 4. Send 5 equal-length sequences in + # parallel and make sure they get completely batched into + # batch-size 4 inferences plus the 5th should go in the + # backlog and then get handled once there is a free slot. + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111, 11112, 11113), dtype, 4 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
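+                    # Five 3-request sequences are sent below (15 requests); the
+                    # delay value of 12 covers the four sequences that can occupy
+                    # slots, while the fifth is expected to wait in the backlog.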
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(6, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(36, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(336, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(3336, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + expected_result = ( + self.get_expected_result(33336, 11113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 33336, 11113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1005, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11111, None), + (None, 11112, None), + ("end", 11113, None), + ), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 15}, 15, 15) + else: + if MODEL_INSTANCES == 1: + self.check_status(model_name, {4: 3, 1: 3}, 6, 15) + elif MODEL_INSTANCES == 2: + self.check_status(model_name, {2: 6, 1: 3}, 9, 15) + else: + self.check_status(model_name, {1: 15}, 15, 15) + except Exception as ex: + 
self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + + def test_backlog_fill(self): + # Test model instances together are configured with + # total-max-batch-size 4. Send 4 sequences in parallel, two of + # which are shorter. Send 2 additional sequences that should + # go into backlog but should immediately fill into the short + # sequences. + + # Only works with 1 model instance since otherwise an instance + # can run ahead and handle more work than expected (leads to + # intermittent failures) + if MODEL_INSTANCES != 1: + return + + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111,), dtype, 4 + ) + precreated_shm5_handles = self.precreate_register_regions( + (22222,), dtype, 5 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 10 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 2 + ) + + threads = [] + expected_result = ( + self.get_expected_result(6, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(24, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 24, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11, None), ("end", 13, None)), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(224, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 224, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 111, None), ("end", 113, None)), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(3336, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(11111, 11111, trial, "start,end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 11111, 11111, trial, "start,end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1005, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start,end", 11111, None),), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(22222, 22222, trial, "start,end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 22222, 22222, trial, "start,end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1006, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start,end", 22222, None),), + expected_result, + precreated_shm5_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() 
+ threads[3].start() + time.sleep(3) + threads[4].start() + threads[5].start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 12}, 12, 12) + else: + self.check_status(model_name, {4: 3}, 3, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + self.cleanup_shm_regions(precreated_shm5_handles) + + def test_backlog_fill_no_end(self): + # Test model instances together are configured with + # total-max-batch-size 4. Send 4 sequences in parallel, two of + # which are shorter. Send 2 additional sequences that should + # go into backlog but should immediately fill into the short + # sequences. One of those sequences is filled before it gets + # its end request. + + # Only works with 1 model instance since otherwise an instance + # can run ahead and handle more work than expected (leads to + # intermittent failures) + if MODEL_INSTANCES != 1: + return + + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111,), dtype, 4 + ) + precreated_shm5_handles = self.precreate_register_regions( + (22222, 22223, 22224), dtype, 5 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
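+                    # As in test_backlog_fill, the in-slot sequences carry
+                    # 3 + 2 + 2 + 3 = 10 requests; sequences 1005 and 1006 fill in
+                    # from the backlog, with 1006's "end" request arriving late
+                    # (2000 ms pre-delay) as described above.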
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 10 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 3 + ) + + threads = [] + expected_result = ( + self.get_expected_result(6, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(24, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 24, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11, None), ("end", 13, None)), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(224, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 224, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 111, None), ("end", 113, None)), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(3336, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(11111, 11111, trial, "start,end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 11111, 11111, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1005, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start,end", 11111, None),), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(66669, 22224, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 66669, 22224, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1006, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 22222, None), + (None, 22223, None), + ("end", 22224, 2000), + ), + expected_result, + precreated_shm5_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + 
time.sleep(2) + threads[1].start() + time.sleep(2) + threads[2].start() + time.sleep(2) + threads[3].start() + time.sleep(2) + threads[4].start() + time.sleep(2) + threads[5].start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 14}, 14, 14) + else: + # Expecting 3 batch-size 4 inferences and then the + # 1006 sequence will follow 1003 (a different + # implementation could also follow 1002...) + self.check_status(model_name, {4: 3, 3: 2}, 5, 14) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + self.cleanup_shm_regions(precreated_shm5_handles) + + def test_backlog_same_correlation_id(self): + # Test model instances together are configured with + # total-max-batch-size 4. Send 4 equal-length sequences in + # parallel and make sure they get completely batched into + # batch-size 4 inferences. Send a 5th with the same + # correlation ID as one of the first four. + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 2, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111, 11113), dtype, 4 + ) + + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
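+                    # The four in-slot sequences carry 4 x 3 = 12 requests,
+                    # matching the delay value; the fifth sequence re-uses
+                    # correlation ID 1002 and its 2 requests account for the
+                    # backlog delay of 2.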
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 2 + ) + + threads = [] + expected_result = ( + self.get_expected_result(6, 3, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 6, 3, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 2, None), ("end", 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(36, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 36, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(336, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 336, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(3336, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 3336, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(22224, 11113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 22224, 11113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11111, None), ("end", 11113, None)), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + threads[3].start() + time.sleep(3) + threads[4].start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 14}, 14, 14) + else: + if MODEL_INSTANCES != 4: + batch_exec = { + (4 / MODEL_INSTANCES): (3 * MODEL_INSTANCES), + 1: 2, + } + else: + batch_exec = {1: (3 * MODEL_INSTANCES) + 2} + self.check_status( + model_name, 
batch_exec, (3 * MODEL_INSTANCES) + 2, 14 + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + + def test_backlog_same_correlation_id_no_end(self): + # Test model instances together are configured with + # total-max-batch-size 4. Send 4 sequences in parallel and + # make sure they get completely batched into batch-size 4 + # inferences. One of the sequences is shorter and does not + # have an end marker but has same correlation ID as the 5th + # sequence. We expect that short sequence to get ended early + # (because of the same correlation ID) and make room for the + # 5th sequence. + + # Only works with 1 model instance since otherwise an instance + # can run ahead and handle more work than expected (leads to + # intermittent failures) + if MODEL_INSTANCES != 1: + return + + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111, 11113), dtype, 4 + ) + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
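+                    # The five sequences below carry 2 + 4 + 4 + 4 + 2 = 16
+                    # requests in total, matching the delay value. Sequence 1001
+                    # never sends an "end" flag; re-using correlation ID 1001 for
+                    # the last sequence is what ends it early, as described above.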
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 16 + ) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(4, 3, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit(4, 3, trial, None, dtype) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), (None, 3, None)), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(48, 13, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 48, 13, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + (None, 12, None), + ("end", 13, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(448, 113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 448, 113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, None), + (None, 112, None), + ("end", 113, None), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(4448, 1113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4448, 1113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + (None, 1112, None), + ("end", 1113, None), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(22224, 11113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 22224, 11113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11111, None), ("end", 11113, None)), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + threads[3].start() + time.sleep(2) + threads[4].start() + for t in threads: + t.join() + self.check_deferred_exception() + if is_ensemble(model_name): + # Requests do not get batched for the ensemble model + self.check_status(model_name, {1: 16}, 16, 16) + else: + self.check_status(model_name, {4: 4}, 4, 16) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if 
TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + + def test_backlog_sequence_timeout(self): + # Test model instances together are configured with + # total-max-batch-size 4. Send 4 sequences in parallel and + # make sure they get completely batched into batch-size 4 + # inferences. One of the sequences has a long delay that + # causes it to timeout and that allows a 5th sequence to come + # out of the backlog and finish. The timed-out sequence will + # then send the delayed inference but it will appear as a new + # sequence and so fail because it doesn't have the START flag. + + # Only works with 1 model instance since otherwise an instance + # can run ahead and handle more work than expected (leads to + # intermittent failures) + if MODEL_INSTANCES != 1: + return + + for trial in _trials: + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # Skip bool type ensemble models + if (any(word in trial for word in ENSEMBLE_PREFIXES)) and ( + dtype == np.bool_ + ): + continue + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1, 3), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 12, 13), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 112, 112, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1112, 1113), dtype, 3 + ) + precreated_shm4_handles = self.precreate_register_regions( + (11111, 11113), dtype, 4 + ) + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for all sequences. 
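+                    # The delay value here is only 4, presumably because just the
+                    # four initial "start" requests arrive promptly; the remaining
+                    # requests use pre_delay_ms values derived from
+                    # _max_sequence_idle_ms, and sequence 1001's second request is
+                    # delayed past the idle timeout so that it times out.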
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(4, 3, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit(4, 3, trial, None, dtype) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1, None), + (None, 3, _max_sequence_idle_ms + 1000), + ), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(48, 13, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 48, 13, trial, None, dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, _max_sequence_idle_ms / 2), + (None, 12, _max_sequence_idle_ms / 2), + ("end", 13, _max_sequence_idle_ms / 2), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(448, 113, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 448, 113, trial, None, dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1003, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 111, None), + (None, 112, _max_sequence_idle_ms / 2), + (None, 112, _max_sequence_idle_ms / 2), + ("end", 113, _max_sequence_idle_ms / 2), + ), + expected_result, + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(4448, 1113, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 4448, 1113, trial, None, dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1004, + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, _max_sequence_idle_ms / 2), + (None, 1112, _max_sequence_idle_ms / 2), + ("end", 1113, _max_sequence_idle_ms / 2), + ), + expected_result, + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(22224, 11113, trial, "end") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 22224, 11113, trial, "end", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1005, + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 11111, None), ("end", 11113, None)), + expected_result, + precreated_shm4_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + threads[1].start() + threads[2].start() + threads[3].start() + time.sleep(2) + threads[4].start() + for t in threads: + t.join() + + self.check_deferred_exception() + self.assertTrue(False, "expected error") + except Exception as ex: + for prefix in 
ENSEMBLE_PREFIXES: + if model_name.startswith(prefix): + base_model_name = model_name[(len(prefix)) :] + self.assertTrue( + ex.message().startswith( + str( + "in ensemble '{}', " + + "inference request for sequence 1001 to " + + "model '{}' must specify the START flag on the first " + + "request of the sequence" + ).format(model_name, base_model_name) + ) + ) + return + self.assertTrue( + ex.message().startswith( + str( + "inference request for sequence 1001 to " + + "model '{}' must specify the START flag on the first " + + "request of the sequence" + ).format(model_name) + ) + ) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + self.cleanup_shm_regions(precreated_shm4_handles) + + def test_queue_delay_no_min_util(self): + # Test model that have set max queue delay but minimum slot utilization + # is 0. Send 2 sequences in parallel and make sure they get completely + # batched into batch-size 2 inferences. The first sequence only has one + # request while the second sequence has two, so expecting the second + # execution to be a batch of 'null, seq 2'. The executions should not be + # waited. + + for trial in _trials: + is_ensemble = False + for prefix in ENSEMBLE_PREFIXES: + if prefix in trial: + is_ensemble = True + break + if is_ensemble: + continue + + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1,), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12), dtype, 1 + ) + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain 2 sequences. 
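+                    # Sequence 1001 sends one request and 1002 sends two; the
+                    # second execution batches a null padding request with 1002's
+                    # second request, so check_status below expects two
+                    # batch-size-2 executions covering 3 requests.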
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(1, 1, trial, "start") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 1, 1, trial, "start", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (2000, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None),), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(23, 12, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 23, 12, trial, None, dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (2000, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + + self.check_deferred_exception() + self.check_status(model_name, {2: 2}, 2, 3) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + + def test_queue_delay_half_min_util(self): + # Test model that have set max queue delay but minimum slot utilization + # is 0.5. Send 2 sequences in parallel and make sure they get completely + # batched into batch-size 2 inferences. The first sequence only has one + # request while the second sequence has two, so expecting the second + # execution to be a batch of 'null, seq 2'. The second execution should + # be waited until the max queue delay is exceeded for sequence 2. + + for trial in _trials: + is_ensemble = False + for prefix in ENSEMBLE_PREFIXES: + if prefix in trial: + is_ensemble = True + break + if is_ensemble: + continue + + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + "_half" + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1,), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12), dtype, 1 + ) + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain 2 sequences. 
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(1, 1, trial, "start") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 1, 1, trial, "start", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (2000, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None),), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(23, 12, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 23, 12, trial, None, dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (4000, 3000), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + + self.check_deferred_exception() + self.check_status(model_name, {2: 2}, 2, 3) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + + def test_queue_delay_full_min_util(self): + # Test model that have set max queue delay but minimum slot utilization + # is 1. Send 2 sequences in parallel and make sure they get completely + # batched into batch-size 2 inferences. The first sequence only has one + # request while the second sequence has two, so expecting the second + # execution to be a batch of 'null, seq 2'. Both executions should be + # waited until the max queue delay is exceeded. + + for trial in _trials: + is_ensemble = False + for prefix in ENSEMBLE_PREFIXES: + if prefix in trial: + is_ensemble = True + break + if is_ensemble: + continue + + dtypes = self.get_datatype(trial) + for dtype in dtypes: + model_name = tu.get_sequence_model_name(trial, dtype) + "_full" + # For bool type control models, use int32 as I/O types + if dtype == np.bool_: + dtype = np.int32 + + self.clear_deferred_exceptions() + + precreated_shm0_handles = self.precreate_register_regions( + (1,), dtype, 0 + ) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12), dtype, 1 + ) + try: + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain 2 sequences. 
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + expected_result = ( + self.get_expected_result(1, 1, trial, "start") + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 1, 1, trial, "start", dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1001, + (4000, 3000), + # (flag_str, value, pre_delay_ms) + (("start", 1, None),), + expected_result, + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + expected_result = ( + self.get_expected_result(23, 12, trial, None) + if not IMPLICIT_STATE + else self.get_expected_result_implicit( + 23, 12, trial, None, dtype + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + 1002, + (6000, 5000), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, 2000), + ), + expected_result, + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + + self.check_deferred_exception() + self.check_status(model_name, {2: 2}, 2, 3) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + + +class SequenceBatcherRequestTimeoutTest(su.SequenceBatcherTestUtil): + def setUp(self): + super(SequenceBatcherRequestTimeoutTest, self).setUp() + # By default, find tritonserver on "localhost", but can be overridden + # with TRITONSERVER_IPADDR envvar + self.server_address_ = ( + os.environ.get("TRITONSERVER_IPADDR", "localhost") + ":8001" + ) + + # Prepare input and expected output based on the model and + # the infer sequence sent for testing. 
If the test is to be extended + # for different sequence and model, then proper grouping should be added + self.model_name_ = "custom_sequence_int32_timeout" + self.tensor_data_ = np.ones(shape=[1, 1], dtype=np.int32) + self.inputs_ = [grpcclient.InferInput("INPUT0", [1, 1], "INT32")] + self.inputs_[0].set_data_from_numpy(self.tensor_data_) + self.expected_out_seq_ = [ + ("OUTPUT0", self.tensor_data_), + ("OUTPUT0", self.tensor_data_), + ("OUTPUT0", self.tensor_data_), + ] + + def send_sequence_with_timeout( + self, seq_id, callback, timeout_us=3000000, request_pause_sec=0 + ): + with grpcclient.InferenceServerClient(self.server_address_) as triton_client: + triton_client.start_stream(callback=callback) + triton_client.async_stream_infer( + self.model_name_, + self.inputs_, + sequence_id=seq_id, + sequence_start=True, + timeout=timeout_us, + ) + if request_pause_sec != 0: + time.sleep(request_pause_sec) + triton_client.async_stream_infer( + self.model_name_, self.inputs_, sequence_id=seq_id, timeout=timeout_us + ) + if request_pause_sec != 0: + time.sleep(request_pause_sec) + triton_client.async_stream_infer( + self.model_name_, + self.inputs_, + sequence_id=seq_id, + sequence_end=True, + timeout=timeout_us, + ) + + def test_request_timeout(self): + # Test long running model that receives requests with shorter timeout, + # expect the timeout will only be expired on backlog sequence and reject + # all requests of the sequence once expired. + # Sending two sequences while the model can only process one sequence + # at a time. Each model execution takes 5 second and all requests have + # 3 second timeout, so the second sequence will be rejected. + + # correlation ID is 1-index + seq1_res = [] + seq2_res = [] + seq1_callback = lambda result, error: seq1_res.append((result, error)) + seq2_callback = lambda result, error: seq2_res.append((result, error)) + + # send sequence with 1s interval to ensure processing order + threads = [] + threads.append( + threading.Thread( + target=self.send_sequence_with_timeout, args=(1, seq1_callback) + ) + ) + threads.append( + threading.Thread( + target=self.send_sequence_with_timeout, args=(2, seq2_callback) + ) + ) + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + + for idx in range(len(seq1_res)): + result, error = seq1_res[idx] + self.assertIsNone( + error, + "Expect successful inference for sequence 1 requests, got error: {}".format( + error + ), + ) + out = result.as_numpy(self.expected_out_seq_[idx][0]) + expected_out = self.expected_out_seq_[idx][1] + np.testing.assert_allclose( + out, + expected_out, + err_msg="Unexpected output tensor: expect {}, got {}".format( + expected_out, out + ), + ) + + for _, error in seq2_res: + self.assertIsNotNone(error, "Expect error for sequence 2 requests") + with self.assertRaisesRegex( + InferenceServerException, + "timeout of the corresponding sequence has been expired", + msg="Unexpected error: {}".format(error), + ): + raise error + + def test_send_request_after_timeout(self): + # Similar to test_request_timeout, but the sequence to be timed out + # will send the last request after the sequence has been timed out, + # and expecting server to return error regarding sending request of + # an untracked sequence + + seq1_res = [] + seq2_res = [] + seq1_callback = lambda result, error: seq1_res.append((result, error)) + seq2_callback = lambda result, error: seq2_res.append((result, error)) + + threads = [] + threads.append( + threading.Thread( + 
target=self.send_sequence_with_timeout, args=(1, seq1_callback) + ) + ) + # Each request will be sent with a pause, so the third request + # will be sent after the sequence has been timed out + threads.append( + threading.Thread( + target=self.send_sequence_with_timeout, + args=(2, seq2_callback), + kwargs={"request_pause_sec": 2}, + ) + ) + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + + # Check error message of the last request and the rest + # separately + for _, error in seq2_res[0:-1]: + self.assertIsNotNone(error, "Expect error for sequence 2 requests") + with self.assertRaisesRegex( + InferenceServerException, + "timeout of the corresponding sequence has been expired", + msg="Unexpected error: {}".format(error), + ): + raise error + _, last_err = seq2_res[-1] + self.assertIsNotNone(last_err, "Expect error for sequence 2 requests") + with self.assertRaisesRegex( + InferenceServerException, + "must specify the START flag on the first request", + msg="Unexpected error: {}".format(last_err), + ): + raise last_err + + +class SequenceBatcherPreserveOrderingTest(su.SequenceBatcherTestUtil): + def setUp(self): + super().setUp() + # By default, find tritonserver on "localhost", but can be overridden + # with TRITONSERVER_IPADDR envvar + self.server_address_ = ( + os.environ.get("TRITONSERVER_IPADDR", "localhost") + ":8001" + ) + + # Prepare input and expected output based on the model and + # the infer sequence sent for testing. If the test is to be extended + # for different sequence and model, then proper grouping should be added + self.model_name_ = "sequence_py" + self.tensor_data_ = np.ones(shape=[1, 1], dtype=np.int32) + self.inputs_ = [grpcclient.InferInput("INPUT0", [1, 1], "INT32")] + self.inputs_[0].set_data_from_numpy(self.tensor_data_) + self.triton_client = grpcclient.InferenceServerClient(self.server_address_) + + # Atomic request ID for multi-threaded inference + self.request_id_lock = threading.Lock() + self.request_id = 1 + + def send_sequence(self, seq_id, seq_id_map, req_id_map): + if seq_id not in seq_id_map: + seq_id_map[seq_id] = [] + + start, middle, end = (True, False), (False, False), (False, True) + # Send sequence with 1 start, 1 middle, and 1 end request + seq_flags = [start, middle, end] + for start_flag, end_flag in seq_flags: + # Introduce random sleep to better interweave requests from different sequences + time.sleep(random.uniform(0.0, 1.0)) + + # Serialize sending requests to ensure ordered request IDs + with self.request_id_lock: + req_id = self.request_id + self.request_id += 1 + + # Store metadata to validate results later + req_id_map[req_id] = seq_id + seq_id_map[seq_id].append(req_id) + + self.triton_client.async_stream_infer( + self.model_name_, + self.inputs_, + sequence_id=seq_id, + sequence_start=start_flag, + sequence_end=end_flag, + timeout=None, + request_id=str(req_id), + ) + + def _test_sequence_ordering(self, preserve_ordering, decoupled): + # 1. Send a few grpc streaming sequence requests to the model. + # 2. With grpc streaming, the model should receive the requests in + # the same order they are sent from client, and the client should + # receive the responses in the same order sent back by the + # model/server. With sequence scheduler, the requests for each sequence should be routed to the same model + # instance, and no two requests from the same sequence should + # get batched together. + # 3. 
With preserve_ordering=False, we may get the responses back in a different + # order than the requests, but with grpc streaming we should still expect responses for each sequence to be ordered. + # 4. Assert that the sequence values are ordered, and that the response IDs per sequence are ordered + class SequenceResult: + def __init__(self, seq_id, result, request_id): + self.seq_id = seq_id + self.result = result + self.request_id = int(request_id) + + def full_callback(sequence_dict, sequence_list, result, error): + # We expect no model errors for this test + if error: + self.assertTrue(False, error) + + # Gather all the necessary metadata for validation + request_id = int(result.get_response().id) + sequence_id = request_id_map[request_id] + # Overall list of results in the order received, regardless of sequence ID + sequence_list.append(SequenceResult(sequence_id, result, request_id)) + # Ordered results organized by their seq IDs + sequence_dict[sequence_id].append(result) + + # Store ordered list in which responses are received by client + sequence_list = [] + # Store mapping of sequence ID to response results + sequence_dict = {} + # Store mapping of sequence ID to request IDs and vice versa + sequence_id_map = {} + request_id_map = {} + + # Start stream + seq_callback = partial(full_callback, sequence_dict, sequence_list) + self.triton_client.start_stream(callback=seq_callback) + + # Send N sequences concurrently + threads = [] + num_sequences = 10 + for i in range(num_sequences): + # Sequence IDs are 1-indexed + sequence_id = i + 1 + # Add a result list and callback for each sequence + sequence_dict[sequence_id] = [] + threads.append( + threading.Thread( + target=self.send_sequence, + args=(sequence_id, sequence_id_map, request_id_map), + ) + ) + + # Start all sequence threads + for t in threads: + t.start() + + # Wait for threads to return + for t in threads: + t.join() + + # Block until all requests are completed + self.triton_client.stop_stream() + + # Make sure some inferences occurred and metadata was collected + self.assertGreater(len(sequence_dict), 0) + self.assertGreater(len(sequence_list), 0) + + # Validate model results are sorted per sequence ID (model specific logic) + print(f"=== {preserve_ordering=} {decoupled=} ===") + print("Outputs per Sequence:") + for seq_id, sequence in sequence_dict.items(): + seq_outputs = [ + result.as_numpy("OUTPUT0").flatten().tolist() for result in sequence + ] + print(f"{seq_id}: {seq_outputs}") + self.assertEqual(seq_outputs, sorted(seq_outputs)) + + # Validate request/response IDs for each response in a sequence is sorted + # This should be true regardless of preserve_ordering or not + print("Request IDs per Sequence:") + for seq_id in sequence_id_map: + per_seq_request_ids = sequence_id_map[seq_id] + print(f"{seq_id}: {per_seq_request_ids}") + self.assertEqual(per_seq_request_ids, sorted(per_seq_request_ids)) + + # Validate results are sorted in request order if preserve_ordering is True + if preserve_ordering: + request_ids = [s.request_id for s in sequence_list] + print(f"Request IDs overall:\n{request_ids}") + sequence_ids = [s.seq_id for s in sequence_list] + print(f"Sequence IDs overall:\n{sequence_ids}") + self.assertEqual(request_ids, sorted(request_ids)) + + # Assert some dynamic batching of requests was done + stats = self.triton_client.get_inference_statistics( + model_name=self.model_name_, headers={}, as_json=True + ) + model_stats = stats["model_stats"][0] + self.assertEqual(model_stats["name"], self.model_name_) + 
self.assertLess( + int(model_stats["execution_count"]), int(model_stats["inference_count"]) + ) + + def test_sequence_with_preserve_ordering(self): + self.model_name_ = "seqpy_preserve_ordering_nondecoupled" + self._test_sequence_ordering(preserve_ordering=True, decoupled=False) + + def test_sequence_without_preserve_ordering(self): + self.model_name_ = "seqpy_no_preserve_ordering_nondecoupled" + self._test_sequence_ordering(preserve_ordering=False, decoupled=False) + + # FIXME [DLIS-5280]: This may fail for decoupled models if writes to GRPC + # stream are done out of order in server, so disable test for now. + # def test_sequence_with_preserve_ordering_decoupled(self): + # self.model_name_ = "seqpy_preserve_ordering_decoupled" + # self._test_sequence_ordering(preserve_ordering=True, decoupled=True) + + # FIXME [DLIS-5280] + # def test_sequence_without_preserve_ordering_decoupled(self): + # self.model_name_ = "seqpy_no_preserve_ordering_decoupled" + # self._test_sequence_ordering(preserve_ordering=False, decoupled=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh new file mode 100755 index 0000000000..ac34458b4e --- /dev/null +++ b/qa/L0_sequence_batcher/test.sh @@ -0,0 +1,929 @@ +#!/bin/bash +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. 
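+
+# Refresh the dynamic linker cache; "|| true" keeps a failure here from
+# aborting the test script.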
+ldconfig || true
+
+export CUDA_VISIBLE_DEVICES=0
+
+CLIENT_LOG="./client.log"
+BATCHER_TEST=sequence_batcher_test.py
+
+if [ -z "$TEST_SYSTEM_SHARED_MEMORY" ]; then
+    TEST_SYSTEM_SHARED_MEMORY="0"
+fi
+
+if [ -z "$TEST_CUDA_SHARED_MEMORY" ]; then
+    TEST_CUDA_SHARED_MEMORY="0"
+fi
+
+if [ -z "$TEST_VALGRIND" ]; then
+    TEST_VALGRIND="0"
+fi
+
+if [ "$TEST_VALGRIND" -eq 1 ]; then
+    LEAKCHECK=/usr/bin/valgrind
+    LEAKCHECK_ARGS_BASE="--leak-check=full --show-leak-kinds=definite --max-threads=3000"
+    SERVER_TIMEOUT=3600
+    rm -f *.valgrind.log
+
+    # Shortened tests due to valgrind overhead
+    MODEL_TRIALS="0 v"
+    NO_DELAY_TESTS="test_simple_sequence \
+                        test_no_sequence_start \
+                        test_batch_size"
+    DELAY_TESTS="test_backlog_fill_no_end \
+                    test_backlog_sequence_timeout \
+                    test_ragged_batch"
+    QUEUE_DELAY_TESTS="test_queue_delay_full_min_util"
+fi
+
+if [ -z "$TEST_JETSON" ]; then
+    TEST_JETSON="0"
+fi
+
+# Shortened tests due to jetson slowdown
+if [ "$TEST_JETSON" -eq 1 ]; then
+    MODEL_TRIALS="0 v"
+fi
+
+TF_VERSION=${TF_VERSION:=2}
+
+# On windows the paths invoked by the script (running in WSL) must use
+# /mnt/c when needed but the paths on the tritonserver command-line
+# must be C:/ style.
+WINDOWS=0
+if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then
+    MODELDIR=${MODELDIR:=C:/models}
+    DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"}
+    BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends}
+    SERVER=${SERVER:=/mnt/c/tritonserver/bin/tritonserver.exe}
+    export WSLENV=$WSLENV:TRITONSERVER_DELAY_SCHEDULER:TRITONSERVER_BACKLOG_DELAY_SCHEDULER
+    WINDOWS=1
+else
+    MODELDIR=${MODELDIR:=`pwd`}
+    DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"}
+    TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
+    SERVER=${TRITON_DIR}/bin/tritonserver
+    BACKEND_DIR=${TRITON_DIR}/backends
+
+    # PyTorch on SBSA requires libgomp to be loaded first.
See the following + # GitHub issue for more information: + # https://github.com/pytorch/pytorch/issues/2575 + arch=`uname -m` + if [ $arch = "aarch64" ]; then + SERVER_LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libgomp.so.1 + fi +fi + +SERVER_ARGS_EXTRA="--backend-directory=${BACKEND_DIR} --backend-config=tensorflow,version=${TF_VERSION} --log-verbose=1" + +source ../common/util.sh + +RET=0 + +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel onnx plan libtorch custom python"} +export BACKENDS + +# If MODEL_TRIALS not specified set to 0 1 2 4 v +MODEL_TRIALS=${MODEL_TRIALS:="0 1 2 4 v"} + +# Basic sequence batcher tests +NO_DELAY_TESTS=${NO_DELAY_TESTS:="test_simple_sequence \ + test_length1_sequence \ + test_batch_size \ + test_no_sequence_start \ + test_no_sequence_start2 \ + test_no_sequence_end \ + test_no_correlation_id"} + +# Tests that use scheduler delay +DELAY_TESTS=${DELAY_TESTS:="test_backlog_fill \ + test_backlog_fill_no_end \ + test_backlog_same_correlation_id \ + test_backlog_same_correlation_id_no_end \ + test_backlog_sequence_timeout \ + test_half_batch \ + test_skip_batch \ + test_full_batch \ + test_ragged_batch \ + test_backlog"} + +# Tests on queue delay +QUEUE_DELAY_TESTS=${QUEUE_DELAY_TESTS:="test_queue_delay_no_min_util \ + test_queue_delay_half_min_util \ + test_queue_delay_full_min_util"} + +# If ENSEMBLES not specified, set to 1 +ENSEMBLES=${ENSEMBLES:="1"} +export ENSEMBLES + +# If IMPLICIT_STATE not specified, set to 0 +IMPLICIT_STATE=${IMPLICIT_STATE:="0"} +export IMPLICIT_STATE + +# If INITIAL_STATE_FILE is not specified, set to 0 +INITIAL_STATE_FILE=${INITIAL_STATE_FILE:="0"} +export INITIAL_STATE_FILE + +# If INITIAL_STATE_ZERO is not specified, set to 0 +INITIAL_STATE_ZERO=${INITIAL_STATE_ZERO:="0"} +export INITIAL_STATE_ZERO + +# If USE_SINGLE_BUFFER is not specified, set to 0 +USE_SINGLE_BUFFER=${USE_SINGLE_BUFFER:="0"} +export USE_SINGLE_BUFFER + +# Setup non-variable-size model repositories. The same models are in each +# repository but they are configured as: +# models0 - four instances with non-batching model +# models1 - one instance with batch-size 4 +# models2 - two instances with batch-size 2 +# models4 - four instances with batch-size 1 +rm -fr *.log models{0,1,2,4} queue_delay_models && mkdir models{0,1,2,4} queue_delay_models + +# Search BACKENDS to determine if a backend should be tested +function should_test_backend() { + local target_backend=$1 + if [[ $(echo "${BACKENDS[@]}" | grep -c "${target_backend}") -ne 0 ]]; then + echo "true" + return + fi + echo "false" +} + +# Get the datatype to use based on the backend +function get_datatype () { + local dtype="int32 bool" + if [[ $1 == "plan" ]]; then + dtype="float32" + elif [[ $1 == "savedmodel" ]]; then + dtype="float32 bool" + elif [[ $1 == "graphdef" ]]; then + dtype="object bool int32" + fi + + # Add type string to the onnx model tests only for implicit state. 
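+    # Note that libtorch implicit-state models also get the "object" type below.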
+ if [ "$IMPLICIT_STATE" == "1" ]; then + if [[ $1 == "onnx" ]]; then + dtype="object int32 bool" + fi + if [[ $1 == "libtorch" ]]; then + dtype="object int32 bool" + fi + fi + echo $dtype +} + +# Modify corresponding onnx config.pbtxt to create python config.pbtxt +function generate_python_models () { + model_path=$1 + dest_dir=$2 + onnx_model=$(echo ${model_path//python/onnx}) + python_model=$(basename $model_path) + mkdir -p $dest_dir/$python_model/1/ + # for emsemble models keep "platform: ensemble" + if [[ "$model_path" == *"ensemble_model"* ]]; then + cat $onnx_model/config.pbtxt | sed 's/onnx/python/g' > $dest_dir/$python_model/config.pbtxt + else + cat $onnx_model/config.pbtxt | sed 's/platform:.*/backend:\ "python"/g' | sed 's/onnx/python/g' > $dest_dir/$python_model/config.pbtxt + cp ../python_models/sequence_int32/model.py $dest_dir/$python_model/1/ + fi +} + +if [[ "$INITIAL_STATE_ZERO" == "1" && "$INITIAL_STATE_FILE" == "1" ]]; then + echo -e "\n***\n*** 'INITIAL_STATE_ZERO' and 'INITIAL_STATE_FILE' can't be enabled simultaneously. \n***" + exit 1 +fi + +FIXED_MODEL_REPOSITORY='' +VAR_MODEL_REPOSITORY='' +if [ "$IMPLICIT_STATE" == "1" ]; then + if [[ "$INITIAL_STATE_ZERO" == "0" && "$INITIAL_STATE_FILE" == "0" ]]; then + FIXED_MODEL_REPOSITORY="qa_sequence_implicit_model_repository" + VAR_MODEL_REPOSITORY="qa_variable_sequence_implicit_model_repository" + else + FIXED_MODEL_REPOSITORY="qa_sequence_initial_state_implicit_model_repository" + VAR_MODEL_REPOSITORY="qa_variable_sequence_initial_state_implicit_model_repository" + fi +else + FIXED_MODEL_REPOSITORY="qa_sequence_model_repository" + VAR_MODEL_REPOSITORY="qa_variable_sequence_model_repository" +fi + +MODELS="" +PYTHON_MODELS="" +for BACKEND in $BACKENDS; do + if [[ $BACKEND == "custom" ]]; then + MODELS="$MODELS ../custom_models/custom_sequence_int32" + else + DTYPES=$(get_datatype $BACKEND) + + for DTYPE in $DTYPES; do + MODELS="$MODELS $DATADIR/$FIXED_MODEL_REPOSITORY/${BACKEND}_sequence_${DTYPE}" + done + + if [ "$ENSEMBLES" == "1" ]; then + for DTYPE in $DTYPES; do + # We don't generate ensemble models for bool data type. + if [[ $DTYPE != "bool" ]]; then + if [ "$BACKEND" == "python" ]; then + PYTHON_MODELS="$DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_onnx_sequence_${DTYPE}" + TMP=$(echo $PYTHON_MODELS) + MODELS="$MODELS ${TMP//onnx/python}" + else + MODELS="$MODELS $DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_${BACKEND}_sequence_${DTYPE}" + fi + fi + done + fi + fi +done + +if [ "$INITIAL_STATE_FILE" == "1" ]; then + # Create the input_state_data file. + rm -rf input_state_data + echo -n -e "\\x64\\x00\\x00\\x00" > input_state_data +fi + +for MODEL in $MODELS; do + if [[ ! "$TEST_VALGRIND" -eq 1 ]]; then + # Skip libtorch string models + if [[ "$MODEL" =~ .*"libtorch".*"object".* ]]; then + continue + fi + if [[ "$MODEL" =~ .*"python".* ]]; then + generate_python_models "$MODEL" "models1" + else + cp -r $MODEL models1/. + fi + (cd models1/$(basename $MODEL) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt) + + # Skip libtorch string models + if [[ "$MODEL" =~ .*"libtorch".*"object".* ]]; then + continue + fi + + if [[ "$MODEL" =~ .*"python".* ]]; then + generate_python_models "$MODEL" "models2" + else + cp -r $MODEL models2/. 
+ fi + (cd models2/$(basename $MODEL) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 2/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 2/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 2/" config.pbtxt) + + if [[ "$MODEL" =~ .*"python".* ]]; then + generate_python_models "$MODEL" "models4" + else + cp -r $MODEL models4/. + fi + (cd models4/$(basename $MODEL) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 4/" config.pbtxt) + + # Duplicate the models for different delay settings + if [[ "$MODEL" =~ .*"python".* ]]; then + generate_python_models "$MODEL" "queue_delay_models" + else + cp -r $MODEL queue_delay_models/. + fi + (cd queue_delay_models/$(basename $MODEL) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/sequence_batching {/sequence_batching {\\ndirect {\\nmax_queue_delay_microseconds: 3000000\\nminimum_slot_utilization: 0\\n}/" config.pbtxt) + + cp -r queue_delay_models/$(basename $MODEL) queue_delay_models/$(basename $MODEL)_half && \ + (cd queue_delay_models/$(basename $MODEL)_half && \ + sed -i "s/$(basename $MODEL)/$(basename $MODEL)_half/" config.pbtxt && \ + sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 0.5/" config.pbtxt) + cp -r queue_delay_models/$(basename $MODEL) queue_delay_models/$(basename $MODEL)_full && \ + (cd queue_delay_models/$(basename $MODEL)_full && \ + sed -i "s/$(basename $MODEL)/$(basename $MODEL)_full/" config.pbtxt && \ + sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 1/" config.pbtxt) + + # TODO: Enable single state buffer testing for sequence batcher + # if [ "$USE_SINGLE_BUFFER" == "1" && "$IMPLICIT_STATE" == "1" ]; then + # SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true" + # (cd models0/$(basename $MODEL) && \ + # sed -i "$SED_REPLACE_PATTERN" config.pbtxt) + # (cd models1/$(basename $MODEL) && \ + # sed -i "$SED_REPLACE_PATTERN" config.pbtxt) + # (cd models2/$(basename $MODEL) && \ + # sed -i "$SED_REPLACE_PATTERN" config.pbtxt) + # (cd models4/$(basename $MODEL) && \ + # sed -i "$SED_REPLACE_PATTERN" config.pbtxt) + # (cd queue_delay_models/$(basename $MODEL)_full && \ + # sed -i "$SED_REPLACE_PATTERN" config.pbtxt) + # (cd queue_delay_models/$(basename $MODEL)_half && \ + # sed -i "$SED_REPLACE_PATTERN" config.pbtxt) + # fi + else + cp -r $MODEL queue_delay_models/$(basename $MODEL)_full && \ + (cd queue_delay_models/$(basename $MODEL)_full && \ + sed -i "s/$(basename $MODEL)/$(basename $MODEL)_full/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/sequence_batching {/sequence_batching {\\ndirect {\\nmax_queue_delay_microseconds: 3000000\\nminimum_slot_utilization: 0\\n}/" config.pbtxt && \ + sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 1/" config.pbtxt) + fi +done + +# Adjust the model repository for reading initial state for implicit state from file +if [ "$INITIAL_STATE_FILE" == "1" ]; then + for MODEL in $MODELS; do + if [[ ! 
"$TEST_VALGRIND" -eq 1 ]]; then + mkdir -p models1/$(basename $MODEL)/initial_state/ && cp input_state_data models1/$(basename $MODEL)/initial_state/ && \ + (cd models1/$(basename $MODEL) && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + + mkdir -p models2/$(basename $MODEL)/initial_state/ && cp input_state_data models2/$(basename $MODEL)/initial_state/ && \ + (cd models2/$(basename $MODEL) && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + + mkdir -p models4/$(basename $MODEL)/initial_state/ && cp input_state_data models4/$(basename $MODEL)/initial_state/ && \ + (cd models4/$(basename $MODEL) && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + + mkdir -p queue_delay_models/$(basename $MODEL)/initial_state/ && cp input_state_data queue_delay_models/$(basename $MODEL)/initial_state/ && \ + (cd queue_delay_models/$(basename $MODEL) && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + + mkdir -p queue_delay_models/$(basename $MODEL)_half/initial_state/ && cp input_state_data queue_delay_models/$(basename $MODEL)_half/initial_state/ && \ + (cd queue_delay_models/$(basename $MODEL)_half && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + + mkdir -p queue_delay_models/$(basename $MODEL)_full/initial_state/ && cp input_state_data queue_delay_models/$(basename $MODEL)_full/initial_state/ && \ + (cd queue_delay_models/$(basename $MODEL)_full && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + else + mkdir -p queue_delay_models/$(basename $MODEL)_full/initial_state/ && cp input_state_data queue_delay_models/$(basename $MODEL)_full/initial_state/ && \ + (cd queue_delay_models/$(basename $MODEL)_full && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + fi + done +fi + +MODELS="" +PYTHON_MODELS="" +for BACKEND in $BACKENDS; do + if [[ $BACKEND == "custom" ]]; then + MODELS="$MODELS ../custom_models/custom_sequence_int32" + else + DTYPES=$(get_datatype $BACKEND) + for DTYPE in $DTYPES; do + MODELS="$MODELS $DATADIR/$FIXED_MODEL_REPOSITORY/${BACKEND}_nobatch_sequence_${DTYPE}" + done + + if [ "$ENSEMBLES" == "1" ]; then + for DTYPE in $DTYPES; do + # We don't generate ensemble models for bool data type. + if [[ $DTYPE != "bool" ]]; then + if [ "$BACKEND" == "python" ]; then + PYTHON_MODELS="$DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_onnx_nobatch_sequence_${DTYPE}" + TMP=$(echo $PYTHON_MODELS) + MODELS="$MODELS ${TMP//onnx/python}" + else + MODELS="$MODELS $DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_${BACKEND}_nobatch_sequence_${DTYPE}" + fi + fi + done + + fi + fi +done + +for MODEL in $MODELS; do + if [[ "$MODEL" =~ .*"python".* ]]; then + generate_python_models "$MODEL" "models0" + else + cp -r $MODEL models0/. 
+ fi + (cd models0/$(basename $MODEL) && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 4/" config.pbtxt) + + if [ "$INITIAL_STATE_FILE" == "1" ]; then + mkdir -p models0/$(basename $MODEL)/initial_state/ && cp input_state_data models0/$(basename $MODEL)/initial_state/ && \ + (cd models0/$(basename $MODEL) && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + fi +done + +# modelsv - one instance with batch-size 4 +rm -fr modelsv && mkdir modelsv + +MODELS="" +PYTHON_MODELS="" +for BACKEND in $BACKENDS; do + if [[ $BACKEND == "custom" ]]; then + MODELS="$MODELS ../custom_models/custom_sequence_int32" + else + DTYPES=$(get_datatype $BACKEND) + for DTYPE in $DTYPES; do + MODELS="$MODELS $DATADIR/${VAR_MODEL_REPOSITORY}/${BACKEND}_sequence_${DTYPE}" + done + + if [ "$ENSEMBLES" == "1" ]; then + for DTYPE in $DTYPES; do + # We don't generate ensemble models for bool data type. + if [[ $DTYPE != "bool" ]]; then + if [ "$BACKEND" == "python" ]; then + PYTHON_MODELS="$DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_onnx_sequence_${DTYPE}" + TMP=$(echo $PYTHON_MODELS) + MODELS="$MODELS ${TMP//onnx/python}" + else + MODELS="$MODELS $DATADIR/qa_ensemble_model_repository/${VAR_MODEL_REPOSITORY}/*_${BACKEND}_sequence_${DTYPE}" + fi + fi + done + fi + fi +done + +for MODEL in $MODELS; do + # Skip libtorch string models + if [[ "$MODEL" =~ .*"libtorch".*"object".* ]]; then + continue + fi + if [[ "$MODEL" =~ .*"python".* ]]; then + generate_python_models "$MODEL" "modelsv" + else + cp -r $MODEL modelsv/. + fi + (cd modelsv/$(basename $MODEL) && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt) + + if [ "$INITIAL_STATE_FILE" == "1" ]; then + mkdir -p modelsv/$(basename $MODEL)/initial_state/ && cp input_state_data modelsv/$(basename $MODEL)/initial_state/ && \ + (cd modelsv/$(basename $MODEL) && \ + sed -i "s/zero_data.*/data_file:\"input_state_data\"/" config.pbtxt) + fi +done + +# Same test work on all models since they all have same total number +# of batch slots. +for model_trial in $MODEL_TRIALS; do + export NO_BATCHING=1 && + [[ "$model_trial" != "0" ]] && export NO_BATCHING=0 + export MODEL_INSTANCES=1 && + [[ "$model_trial" != "v" ]] && export MODEL_INSTANCES=4 && + [[ "$model_trial" != "0" ]] && export MODEL_INSTANCES=$model_trial + + MODEL_PATH=models${model_trial} + + if [ "$ENSEMBLES" == "1" ]; then + cp -r $DATADIR/qa_ensemble_model_repository/${FIXED_MODEL_REPOSITORY}/nop_* `pwd`/$MODEL_PATH/. + create_nop_version_dir `pwd`/$MODEL_PATH + # Must load identity backend on GPU to avoid cuda init delay during 1st run + for NOP_MODEL in `pwd`/$MODEL_PATH/nop_*; do + (cd $NOP_MODEL && sed -i "s/kind: KIND_CPU/kind: KIND_GPU/" config.pbtxt) + done + fi + + # Need to launch the server for each test so that the model status + # is reset (which is used to make sure the correct batch size was + # used for execution). Test everything with fixed-tensor-size + # models and variable-tensor-size models. 
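+    # BATCHER_TYPE is VARIABLE only for the variable-tensor-size repository
+    # (model_trial "v"); every other repository uses FIXED.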
+ export BATCHER_TYPE="VARIABLE" && + [[ "$model_trial" != "v" ]] && export BATCHER_TYPE="FIXED" + + for i in $NO_DELAY_TESTS; do + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$MODEL_PATH.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$MODEL_PATH.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, repository $MODEL_PATH" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST SequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e + done + + # Tests that require TRITONSERVER_DELAY_SCHEDULER so that the + # scheduler is delayed and requests can collect in the queue. + for i in $DELAY_TESTS; do + export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=3 && + [[ "$i" != "test_backlog_fill_no_end" ]] && export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=2 && + [[ "$i" != "test_backlog_fill" ]] && + [[ "$i" != "test_backlog_same_correlation_id" ]] && export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 + export TRITONSERVER_DELAY_SCHEDULER=10 && + [[ "$i" != "test_backlog_fill_no_end" ]] && + [[ "$i" != "test_backlog_fill" ]] && export TRITONSERVER_DELAY_SCHEDULER=16 && + [[ "$i" != "test_backlog_same_correlation_id_no_end" ]] && export TRITONSERVER_DELAY_SCHEDULER=8 && + [[ "$i" != "test_half_batch" ]] && export TRITONSERVER_DELAY_SCHEDULER=4 && + [[ "$i" != "test_backlog_sequence_timeout" ]] && export TRITONSERVER_DELAY_SCHEDULER=12 + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$MODEL_PATH.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$MODEL_PATH.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, repository $MODEL_PATH" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST SequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + unset TRITONSERVER_BACKLOG_DELAY_SCHEDULER + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e + done +done + +# ragged models +if [[ $BACKENDS == *"custom"* ]]; then + rm -fr ragged_models && mkdir ragged_models + cp -r ../custom_models/custom_sequence_int32 ragged_models/. 
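+    # Allow ragged batching on the INPUT tensor of the copied custom model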
+ (cd ragged_models/custom_sequence_int32 && \ + sed -i "s/name:.*\"INPUT\"/name: \"INPUT\"\\nallow_ragged_batch: true/" config.pbtxt) + + export NO_BATCHING=0 + export MODEL_INSTANCES=1 + export BATCHER_TYPE="FIXED" + MODEL_PATH=ragged_models + + # Need to launch the server for each test so that the model status + # is reset (which is used to make sure the correct batch size was + # used for execution). Test everything with fixed-tensor-size + # models and variable-tensor-size models. + for i in test_ragged_batch_allowed ; do + export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 + export TRITONSERVER_DELAY_SCHEDULER=12 + + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$MODEL_PATH.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$MODEL_PATH.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, repository $MODEL_PATH" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST SequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + unset TRITONSERVER_BACKLOG_DELAY_SCHEDULER + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e + done +fi + +# max queue delay +MODEL_PATH=queue_delay_models +# remove ensemble models from the test model repo +rm -rf queue_delay_models/simple_* queue_delay_models/fan_* queue_delay_models/sequence_* +for i in $QUEUE_DELAY_TESTS ; do + export NO_BATCHING=0 + export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 + export TRITONSERVER_DELAY_SCHEDULER=2 + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$MODEL_PATH.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$MODEL_PATH.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, repository $MODEL_PATH" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST SequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + unset TRITONSERVER_BACKLOG_DELAY_SCHEDULER + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? 
-ne 0 ]; then
+            RET=1
+        fi
+    fi
+    set -e
+done
+
+# Test request timeout with sequence batcher.
+# Only run this test when shared memory is not being tested, since the
+# shared-memory feature is irrelevant here.
+if [ "$TEST_SYSTEM_SHARED_MEMORY" -ne 1 ] && [ "$TEST_CUDA_SHARED_MEMORY" -ne 1 ]; then
+    export NO_BATCHING=0
+    export MODEL_INSTANCES=1
+    export BATCHER_TYPE="FIXED"
+
+    TEST_CASE=SequenceBatcherRequestTimeoutTest
+    MODEL_PATH=request_timeout_models
+    mkdir -p ${MODEL_PATH}/custom_sequence_int32_timeout/1
+
+    SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}"
+    SERVER_LOG="./$TEST_CASE.$MODEL_PATH.server.log"
+
+    if [ "$TEST_VALGRIND" -eq 1 ]; then
+        LEAKCHECK_LOG="./$i.$MODEL_PATH.valgrind.log"
+        LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG"
+        run_server_leakcheck
+    else
+        run_server
+    fi
+
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        exit 1
+    fi
+
+    echo "Test: $TEST_CASE, repository $MODEL_PATH" >>$CLIENT_LOG
+
+    set +e
+    python3 $BATCHER_TEST $TEST_CASE >>$CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** Test $TEST_CASE Failed\n***" >>$CLIENT_LOG
+        echo -e "\n***\n*** Test $TEST_CASE Failed\n***"
+        RET=1
+    else
+        check_test_results $TEST_RESULT_FILE 2
+        if [ $? -ne 0 ]; then
+            cat $CLIENT_LOG
+            echo -e "\n***\n*** Test Result Verification Failed\n***"
+            RET=1
+        fi
+    fi
+    set -e
+
+    kill_server
+
+    set +e
+    if [ "$TEST_VALGRIND" -eq 1 ]; then
+        python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG
+        if [ $? -ne 0 ]; then
+            RET=1
+        fi
+    fi
+    set -e
+fi
+
+### Start Preserve Ordering Tests ###
+
+# FIXME: This test is not currently supported on Windows due to its use of
+# python backend models. Now that Windows supports the PYBE, we should check
+# that this test works once Windows CI is stable.
+
+# These subtests use python models. They should not be executed if 'python' is not one
+# of the backends under test.
+if [[ $(should_test_backend "python") == "true" && !( -v WSL_DISTRO_NAME || -v MSYSTEM )]]; then
+    # Test preserve ordering true/false and decoupled/non-decoupled
+    TEST_CASE=SequenceBatcherPreserveOrderingTest
+    MODEL_PATH=preserve_ordering_models
+    BASE_MODEL="../python_models/sequence_py"
+    rm -rf ${MODEL_PATH}
+
+    # FIXME [DLIS-5280]: This may fail for decoupled models if writes to GRPC
+    # stream are done out of order in server, so decoupled tests are disabled.
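+    # Both decoupled and non-decoupled model repositories are generated below,
+    # but only the non-decoupled test cases are currently enabled.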
+ MODES="decoupled nondecoupled" + for mode in $MODES; do + NO_PRESERVE="${MODEL_PATH}/seqpy_no_preserve_ordering_${mode}" + mkdir -p ${NO_PRESERVE}/1 + cp ${BASE_MODEL}/config.pbtxt ${NO_PRESERVE} + cp ${BASE_MODEL}/model.py ${NO_PRESERVE}/1 + + PRESERVE="${MODEL_PATH}/seqpy_preserve_ordering_${mode}" + cp -r ${NO_PRESERVE} ${PRESERVE} + sed -i "s/^preserve_ordering: False/preserve_ordering: True/" ${PRESERVE}/config.pbtxt + + if [ ${mode} == "decoupled" ]; then + echo -e "\nmodel_transaction_policy { decoupled: true }" >> ${NO_PRESERVE}/config.pbtxt + echo -e "\nmodel_transaction_policy { decoupled: true }" >> ${PRESERVE}/config.pbtxt + fi + done + + SERVER_ARGS="--model-repository=$MODELDIR/$MODEL_PATH ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$TEST_CASE.$MODEL_PATH.server.log" + + if [ "$TEST_VALGRIND" -eq 1 ]; then + LEAKCHECK_LOG="./$i.$MODEL_PATH.valgrind.log" + LEAKCHECK_ARGS="$LEAKCHECK_ARGS_BASE --log-file=$LEAKCHECK_LOG" + run_server_leakcheck + else + run_server + fi + + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $TEST_CASE, repository $MODEL_PATH" >>$CLIENT_LOG + + set +e + python3 $BATCHER_TEST $TEST_CASE >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $TEST_CASE Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $TEST_CASE Failed\n***" + RET=1 + else + # 2 for preserve_ordering = True/False + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill_server + + set +e + if [ "$TEST_VALGRIND" -eq 1 ]; then + python3 ../common/check_valgrind_log.py -f $LEAKCHECK_LOG + if [ $? -ne 0 ]; then + RET=1 + fi + fi + set -e +fi + +### End Preserve Ordering Tests ### + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py new file mode 100755 index 0000000000..15f16da352 --- /dev/null +++ b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import threading +import time +import unittest + +import numpy as np +import sequence_util as su +import test_util as tu + +_test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +_test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) + +_no_batching = int(os.environ["NO_BATCHING"]) == 1 +_model_instances = int(os.environ["MODEL_INSTANCES"]) + +if _no_batching: + _trials = ("savedmodel_nobatch", "graphdef_nobatch", "plan_nobatch", "onnx_nobatch") +else: + _trials = ("savedmodel", "graphdef", "plan", "onnx") + +_protocols = ("http", "grpc") +_max_sequence_idle_ms = 5000 + + +class SequenceCorrIDBatcherTest(su.SequenceBatcherTestUtil): + def get_datatype(self, trial): + return np.int32 + + def get_expected_result(self, expected_result, corrid, value, trial, flag_str=None): + # Adjust the expected_result for models that + # could not implement the full accumulator. See + # qa/common/gen_qa_dyna_sequence_models.py for more + # information. + if ( + (("nobatch" not in trial) and ("custom" not in trial)) + or ("graphdef" in trial) + or ("plan" in trial) + or ("onnx" in trial) + ) or ("libtorch" in trial): + expected_result = value + if flag_str is not None: + if "start" in flag_str: + expected_result += 1 + if "end" in flag_str: + expected_result += corrid + return expected_result + + def test_skip_batch(self): + # Test model instances together are configured with + # total-batch-size 4. Send four sequences in parallel where + # two sequences have shorter length so that padding must be + # applied correctly for the longer sequences. + for trial in _trials: + self.clear_deferred_exceptions() + dtype = self.get_datatype(trial) + precreated_shm0_handles = self.precreate_register_regions((1, 3), dtype, 0) + precreated_shm1_handles = self.precreate_register_regions( + (11, 12, 13, 14), dtype, 1 + ) + precreated_shm2_handles = self.precreate_register_regions( + (111, 113), dtype, 2 + ) + precreated_shm3_handles = self.precreate_register_regions( + (1111, 1112, 1113, 1114), dtype, 3 + ) + try: + model_name = tu.get_dyna_sequence_model_name(trial, dtype) + + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
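+                # The four sequences below send 2 + 4 + 2 + 4 = 12 requests in
+                # total; test.sh sets TRITONSERVER_DELAY_SCHEDULER=12 so all of
+                # them are queued before any execution starts.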
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + corrids = [1001, 1002, 1003, 1004] + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 1, None), ("end", 3, None)), + self.get_expected_result( + 4 + corrids[0], corrids[0], 3, trial, "end" + ), + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 11, None), + (None, 12, None), + (None, 13, None), + ("end", 14, None), + ), + self.get_expected_result( + 50 + corrids[1], corrids[1], 14, trial, "end" + ), + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, value, pre_delay_ms) + (("start", 111, None), ("end", 113, None)), + self.get_expected_result( + 224 + corrids[2], corrids[2], 113, trial, "end" + ), + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_async, + args=( + trial, + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, value, pre_delay_ms) + ( + ("start", 1111, None), + (None, 1112, None), + (None, 1113, None), + ("end", 1114, None), + ), + self.get_expected_result( + 4450 + corrids[3], corrids[3], 1114, trial, "end" + ), + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) + + threads[1].start() + threads[3].start() + time.sleep(1) + threads[0].start() + threads[2].start() + for t in threads: + t.join() + self.check_deferred_exception() + if _model_instances == 1: + self.check_status(model_name, {4: 4}, 12, 12) + elif _model_instances == 2: + self.check_status(model_name, {2: 8}, 12, 12) + elif _model_instances == 4: + self.check_status(model_name, {1: 12}, 12, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if _test_system_shared_memory or _test_cuda_shared_memory: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_sequence_corrid_batcher/test.sh b/qa/L0_sequence_corrid_batcher/test.sh new file mode 100755 index 0000000000..8d114a395a --- /dev/null +++ b/qa/L0_sequence_corrid_batcher/test.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' + +CLIENT_LOG="./client.log" +BATCHER_TEST=sequence_corrid_batcher_test.py + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. +export CUDA_VISIBLE_DEVICES=0 + +# Setup non-variable-size model repositories. The same models are in each +# repository but they are configured as: +# models4 - four instances with batch-size 1 +rm -fr *.log models{0,1,2,4} && mkdir models4 +for m in \ + $DATADIR/qa_dyna_sequence_model_repository/graphdef_dyna_sequence_int32 \ + $DATADIR/qa_dyna_sequence_model_repository/savedmodel_dyna_sequence_int32 \ + $DATADIR/qa_dyna_sequence_model_repository/plan_dyna_sequence_int32 \ + $DATADIR/qa_dyna_sequence_model_repository/onnx_dyna_sequence_int32 \ + $DATADIR/qa_dyna_sequence_model_repository/libtorch_dyna_sequence_int32; do + cp -r $m models4/. && \ + (cd models4/$(basename $m) && \ + sed -i -z "s/oldest.*{.*}.*control_input/control_input/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 4/" config.pbtxt) +done + +# Same test work on all models since they all have same total number +# of batch slots. +for model_trial in 4; do + export NO_BATCHING=1 && + [[ "$model_trial" != "0" ]] && export NO_BATCHING=0 + export MODEL_INSTANCES=1 && + [[ "$model_trial" != "0" ]] && export MODEL_INSTANCES=$model_trial + + MODEL_DIR=models${model_trial} + + # Tests that require TRITONSERVER_DELAY_SCHEDULER so that the + # scheduler is delayed and requests can collect in the queue. 
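+    # test_skip_batch issues 12 requests across four sequences, so hold the
+    # scheduler until all of them are queued.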
+ for i in test_skip_batch ; do + export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 + export TRITONSERVER_DELAY_SCHEDULER=12 + SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR" + SERVER_LOG="./$i.$MODEL_DIR.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, repository $MODEL_DIR" >>$CLIENT_LOG + + set +e + python $BATCHER_TEST SequenceCorrIDBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + unset TRITONSERVER_BACKLOG_DELAY_SCHEDULER + kill $SERVER_PID + wait $SERVER_PID + done +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_sequence_stress/sequence_stress.py b/qa/L0_sequence_stress/sequence_stress.py new file mode 100755 index 0000000000..bd71e9bcc2 --- /dev/null +++ b/qa/L0_sequence_stress/sequence_stress.py @@ -0,0 +1,657 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import argparse +import threading +import time +import traceback +from builtins import range, str +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import np_to_triton_dtype + +if sys.version_info >= (3, 0): + import queue +else: + import Queue as queue + +FLAGS = None +CORRELATION_ID_BLOCK_SIZE = 100 +DEFAULT_TIMEOUT_MS = 5000 +SEQUENCE_LENGTH_MEAN = 16 +SEQUENCE_LENGTH_STDEV = 8 + +_thread_exceptions = [] +_thread_exceptions_mutex = threading.Lock() + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +# Callback function used for async_stream_infer() +def completion_callback(user_data, result, error): + # passing error raise and handling out + user_data._completed_requests.put((result, error)) + + +class TimeoutException(Exception): + pass + + +def check_sequence_async( + client_metadata, + trial, + model_name, + input_dtype, + steps, + timeout_ms=DEFAULT_TIMEOUT_MS, + sequence_name="", +): + """Perform sequence of inferences using async run. The 'steps' holds + a list of tuples, one for each inference with format: + + (flag_str, value, expected_result, delay_ms) + + """ + if ( + ("savedmodel" in trial) + or ("graphdef" in trial) + or ("custom" in trial) + or ("plan" in trial) + ): + tensor_shape = ( + 1, + 1, + ) + else: + assert False, "unknown trial type: " + trial + + triton_client = client_metadata[0] + sequence_id = client_metadata[1] + + # Execute the sequence of inference... + seq_start_ms = int(round(time.time() * 1000)) + user_data = UserData() + # Ensure there is no running stream + triton_client.stop_stream() + triton_client.start_stream(partial(completion_callback, user_data)) + + sent_count = 0 + for flag_str, value, expected_result, delay_ms in steps: + seq_start = False + seq_end = False + if flag_str is not None: + seq_start = "start" in flag_str + seq_end = "end" in flag_str + + if input_dtype == np.object_: + in0 = np.full(tensor_shape, value, dtype=np.int32) + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) + in0 = in0n.reshape(tensor_shape) + else: + in0 = np.full(tensor_shape, value, dtype=input_dtype) + inputs = [ + grpcclient.InferInput( + "INPUT", tensor_shape, np_to_triton_dtype(input_dtype) + ), + ] + inputs[0].set_data_from_numpy(in0) + + triton_client.async_stream_infer( + model_name, + inputs, + sequence_id=sequence_id, + sequence_start=seq_start, + sequence_end=seq_end, + ) + sent_count += 1 + + if delay_ms is not None: + time.sleep(delay_ms / 1000.0) + + # Process the results in order that they were sent + result = None + processed_count = 0 + while processed_count < sent_count: + (results, error) = user_data._completed_requests.get() + if error is not None: + raise error + + (_, value, expected, _) = steps[processed_count] + processed_count += 1 + if timeout_ms != None: + now_ms = int(round(time.time() * 1000)) + if (now_ms - seq_start_ms) > timeout_ms: + raise TimeoutException("Timeout expired for {}".format(sequence_name)) + + result = results.as_numpy("OUTPUT")[0][0] + if FLAGS.verbose: + print("{} {}: + {} = {}".format(sequence_name, sequence_id, value, result)) + + if expected is not None: + if input_dtype == np.object_: + assert int(result) == expected, "{}: expected result {}, got {}".format( + sequence_name, expected, int(result) + ) + else: + assert result == expected, "{}: expected result {}, got {}".format( + sequence_name, expected, result + ) + 
triton_client.stop_stream() + + +def get_datatype(trial): + # Get the datatype to use based on what models are available (see test.sh) + if ("plan" in trial) or ("savedmodel" in trial): + return np.float32 + if "graphdef" in trial: + return np.dtype(object) + return np.int32 + + +def sequence_valid( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): + # Create a variable length sequence with "start" and "end" flags. + seqlen = max(1, int(rng.normal(len_mean, len_stddev))) + print("{} {}: valid seqlen = {}".format(sequence_name, client_metadata[1], seqlen)) + + values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) + + steps = [] + expected_result = 0 + + for idx, step in enumerate(range(seqlen)): + flags = "" + if idx == 0: + flags += ",start" + if idx == (seqlen - 1): + flags += ",end" + + val = values[idx] + delay_ms = None + expected_result += val + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) + + +def sequence_valid_valid( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): + # Create two variable length sequences with "start" and "end" + # flags, where both sequences use the same correlation ID and are + # sent back-to-back. + seqlen = [ + max(1, int(rng.normal(len_mean, len_stddev))), + max(1, int(rng.normal(len_mean, len_stddev))), + ] + print( + "{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format( + sequence_name, client_metadata[1], seqlen[0], seqlen[1] + ) + ) + + values = [ + rng.randint(0, 1024 * 1024, size=seqlen[0], dtype=dtype), + rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype), + ] + + for p in [0, 1]: + steps = [] + expected_result = 0 + + for idx, step in enumerate(range(seqlen[p])): + flags = "" + if idx == 0: + flags += ",start" + if idx == (seqlen[p] - 1): + flags += ",end" + + val = values[p][idx] + delay_ms = None + expected_result += val + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) + + +def sequence_valid_no_end( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): + # Create two variable length sequences, the first with "start" and + # "end" flags and the second with no "end" flag, where both + # sequences use the same correlation ID and are sent back-to-back. 
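+    # (illustrative) For example, with seqlen = [3, 2] the flag strings built
+    # below are ",start", "", ",end" for the first sequence and ",start", ""
+    # for the second, which is never explicitly ended and is left for the
+    # server's idle-sequence handling to clean up.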
+ seqlen = [ + max(1, int(rng.normal(len_mean, len_stddev))), + max(1, int(rng.normal(len_mean, len_stddev))), + ] + print( + "{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format( + sequence_name, client_metadata[1], seqlen[0], seqlen[1] + ) + ) + + values = [ + rng.randint(0, 1024 * 1024, size=seqlen[0], dtype=dtype), + rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype), + ] + + for p in [0, 1]: + steps = [] + expected_result = 0 + + for idx, step in enumerate(range(seqlen[p])): + flags = "" + if idx == 0: + flags += ",start" + if (p == 0) and (idx == (seqlen[p] - 1)): + flags += ",end" + + val = values[p][idx] + delay_ms = None + expected_result += val + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) + + +def sequence_no_start(client_metadata, rng, trial, model_name, dtype, sequence_name): + # Create a sequence without a "start" flag. Sequence should get an + # error from the server. + seqlen = 1 + print( + "{} {}: no-start seqlen = {}".format(sequence_name, client_metadata[1], seqlen) + ) + + values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) + + steps = [] + + for idx, step in enumerate(range(seqlen)): + flags = None + val = values[idx] + delay_ms = None + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, None, delay_ms), + ) + + try: + check_sequence_async( + client_metadata, + trial, + model_name, + dtype, + steps, + sequence_name=sequence_name, + ) + assert False, "expected inference failure from missing START flag" + except Exception as ex: + if "must specify the START flag" not in ex.message(): + raise + + +def sequence_no_end( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): + # Create a variable length sequence with "start" flag but that + # never ends. The sequence should be aborted by the server and its + # slot reused for another sequence. + seqlen = max(1, int(rng.normal(len_mean, len_stddev))) + print("{} {}: no-end seqlen = {}".format(sequence_name, client_metadata[1], seqlen)) + + values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) + + steps = [] + expected_result = 0 + + for idx, step in enumerate(range(seqlen)): + flags = "" + if idx == 0: + flags = "start" + + val = values[idx] + delay_ms = None + expected_result += val + + # (flag_str, value, expected_result, delay_ms) + steps.append( + (flags, val, expected_result, delay_ms), + ) + + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) + + +def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, dtype): + # Thread responsible for generating sequences of inference + # requests. + global _thread_exceptions + + print("Starting thread {} with seed {}".format(name, seed)) + rng = np.random.RandomState(seed) + + client_metadata_list = [] + + try: + # Must use streaming GRPC context to ensure each sequences' + # requests are received in order. Create 2 common-use contexts + # with different correlation IDs that are used for most + # inference requests. Also create some rare-use contexts that + # are used to make requests with rarely-used correlation IDs. + # + # Need to remember the last choice for each context since we + # don't want some choices to follow others since that gives + # results not expected. See below for details. 
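+        # (illustrative) For example, a "no-start" sequence must not follow a
+        # "no-end" or "valid-no-end" sequence on the same correlation ID,
+        # since the server would treat it as a continuation of the unfinished
+        # sequence rather than reporting the missing START flag (see the
+        # common-context branch below).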
+ common_cnt = 2 + rare_cnt = 8 + last_choices = [] + + for c in range(common_cnt + rare_cnt): + client_metadata_list.append( + ( + grpcclient.InferenceServerClient( + "localhost:8001", verbose=FLAGS.verbose + ), + correlation_id_base + c, + ) + ) + last_choices.append(None) + + rare_idx = 0 + for p in range(pass_cnt): + # Common or rare context? + if rng.rand() < 0.1: + # Rare context... + choice = rng.rand() + client_idx = common_cnt + rare_idx + + # Send a no-end, valid-no-end or valid-valid + # sequence... because it is a rare context this should + # exercise the idle sequence path of the sequence + # scheduler + if choice < 0.33: + sequence_no_end( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "no-end" + elif choice < 0.66: + sequence_valid_no_end( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "valid-no-end" + else: + sequence_valid_valid( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "valid-valid" + + rare_idx = (rare_idx + 1) % rare_cnt + else: + # Common context... + client_idx = 0 if rng.rand() < 0.5 else 1 + client_metadata = client_metadata_list[client_idx] + last_choice = last_choices[client_idx] + + choice = rng.rand() + + # no-start cannot follow no-end since the server will + # just assume that the no-start is a continuation of + # the no-end sequence instead of being a sequence + # missing start flag. + if ( + (last_choice != "no-end") + and (last_choice != "valid-no-end") + and (choice < 0.01) + ): + sequence_no_start( + client_metadata, + rng, + trial, + model_name, + dtype, + sequence_name=name, + ) + last_choices[client_idx] = "no-start" + elif choice < 0.05: + sequence_no_end( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "no-end" + elif choice < 0.10: + sequence_valid_no_end( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "valid-no-end" + elif choice < 0.15: + sequence_valid_valid( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "valid-valid" + else: + sequence_valid( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) + last_choices[client_idx] = "valid" + + except Exception as ex: + _thread_exceptions_mutex.acquire() + try: + _thread_exceptions.append(traceback.format_exc()) + finally: + _thread_exceptions_mutex.release() + + # We need to explicitly close each client so that streams get + # cleaned up and closed correctly, otherwise the application + # can hang when exiting. 
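+    # Each client_metadata_list entry is a (client, correlation_id) tuple
+    # created above, hence the (c, i) unpacking below.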
+ for c, i in client_metadata_list: + print("thread {} closing client {}".format(name, i)) + c.close() + + print("Exiting thread {}".format(name)) + + +def check_status(model_name): + client = grpcclient.InferenceServerClient("localhost:8001", verbose=FLAGS.verbose) + stats = client.get_inference_statistics(model_name) + print(stats) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-r", "--random-seed", type=int, required=False, help="Random seed." + ) + parser.add_argument( + "-t", + "--concurrency", + type=int, + required=False, + default=8, + help="Request concurrency. Default is 8.", + ) + parser.add_argument( + "-i", + "--iterations", + type=int, + required=False, + default=200, + help="Number of iterations of stress test to run. Default is 200.", + ) + FLAGS = parser.parse_args() + + # Initialize the random seed. For reproducibility each thread + # maintains its own RNG which is initialized based on this seed. + randseed = 0 + if FLAGS.random_seed != None: + randseed = FLAGS.random_seed + else: + randseed = int(time.time()) + np.random.seed(randseed) + + print("random seed = {}".format(randseed)) + print("concurrency = {}".format(FLAGS.concurrency)) + print("iterations = {}".format(FLAGS.iterations)) + + trial = "custom" + dtype = get_datatype(trial) + model_name = tu.get_sequence_model_name(trial, dtype) + + threads = [] + for idx, thd in enumerate(range(FLAGS.concurrency)): + thread_name = "thread_{}".format(idx) + + # Create the seed for the thread. Since these are created in + # reproducible order off of the initial seed we will get + # reproducible results when given the same seed. + seed = np.random.randint(2**32) + + # Each thread is reserved a block of correlation IDs or size + # CORRELATION_ID_BLOCK_SIZE + correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE) + + threads.append( + threading.Thread( + target=stress_thread, + args=( + thread_name, + seed, + FLAGS.iterations, + correlation_id_base, + trial, + model_name, + dtype, + ), + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + + check_status(model_name) + + _thread_exceptions_mutex.acquire() + try: + if len(_thread_exceptions) > 0: + for ex in _thread_exceptions: + print("*********\n{}".format(ex)) + sys.exit(1) + finally: + _thread_exceptions_mutex.release() + + print("Exiting stress test") + sys.exit(0) diff --git a/qa/L0_sequence_stress/test.sh b/qa/L0_sequence_stress/test.sh new file mode 100755 index 0000000000..b2bc66f8ac --- /dev/null +++ b/qa/L0_sequence_stress/test.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +STRESS_TEST=sequence_stress.py + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 + +# Setup model repository. +# models1 - one instance with batch-size 4 +# models2 - two instances with batch-size 2 +# models4 - four instances with batch-size 1 +rm -fr *.log models{1,2,4} && mkdir models{1,2,4} +for m in ../custom_models/custom_sequence_int32 ; do + cp -r $m models1/. && \ + (cd models1/$(basename $m) && \ + sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 1000000/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 1/" config.pbtxt) + cp -r $m models2/. && \ + (cd models2/$(basename $m) && \ + sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 1000000/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 2/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 2/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 2/" config.pbtxt) + cp -r $m models4/. && \ + (cd models4/$(basename $m) && \ + sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 1000000/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 1/" config.pbtxt && \ + sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 4/" config.pbtxt && \ + sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 4/" config.pbtxt) +done + +# Stress-test each model repository +for model_trial in 1 2 4 ; do + MODEL_DIR=models${model_trial} + SERVER_ARGS="--model-repository=`pwd`/$MODEL_DIR" + SERVER_LOG="./$MODEL_DIR.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + python $STRESS_TEST >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_server_status/server_status_test.py b/qa/L0_server_status/server_status_test.py old mode 100644 new mode 100755 index 4fc178f5ae..c54a4e8c0a --- a/qa/L0_server_status/server_status_test.py +++ b/qa/L0_server_status/server_status_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,134 +27,286 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + sys.path.append("../common") -from builtins import range -from future.utils import iteritems import os -import infer_util as iu import unittest -from tensorrtserver.api import * -import tensorrtserver.api.server_status_pb2 as server_status - - -def _get_server_status(url="localhost:8000", protocol=ProtocolType.HTTP, model_name=None): - ctx = ServerStatusContext(url, protocol, model_name, True) - return (ctx.get_server_status(), ctx.get_last_request_id()) +import infer_util as iu +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import * -class ServerStatusTest(unittest.TestCase): +class ServerMetadataTest(tu.TestResultCollector): def test_basic(self): try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - model_name0 = "graphdef_int32_int8_int8" - server_status0, req_id0 = _get_server_status(pair[0], pair[1], model_name0) - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], - server_status0.version) - self.assertEqual("inference:0", server_status0.id) - uptime0 = server_status0.uptime_ns - self.assertGreater(uptime0, 0) - self.assertEqual(len(server_status0.model_status), 1) - self.assertTrue(model_name0 in server_status0.model_status, - "expected status for model " + model_name0) - - model_name1 = "graphdef_float32_float32_float32" - server_status1, req_id1 = _get_server_status(pair[0], pair[1], model_name1) - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], - server_status1.version) - self.assertEqual("inference:0", server_status1.id) - uptime1 = server_status1.uptime_ns - self.assertEqual(len(server_status1.model_status), 1) - self.assertTrue(model_name1 in server_status1.model_status, - "expected status for model " + model_name1) - - self.assertGreater(uptime1, uptime0) - self.assertEqual(req_id1, req_id0 + 1) - - server_status2, req_id2 = _get_server_status(pair[0], pair[1]) - self.assertEqual(os.environ["TENSORRT_SERVER_VERSION"], - server_status2.version) - self.assertEqual("inference:0", server_status2.id) - uptime2 = server_status2.uptime_ns - for mn in (model_name0, model_name1, "netdef_float32_float32_float32", - "plan_float32_float32_float32"): - self.assertTrue(mn in server_status2.model_status, - "expected status for model " + model_name1) - - self.assertGreater(uptime2, uptime1) - self.assertEqual(req_id2, req_id1 + 1) - + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + model_name = "graphdef_int32_int8_int8" + extensions = [ + "classification", + "sequence", + "model_repository", + "schedule_policy", + "model_configuration", + "system_shared_memory", + "cuda_shared_memory", + "binary_tensor_data", + "statistics", + ] + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + server_metadata = triton_client.get_server_metadata() + model_metadata = triton_client.get_model_metadata(model_name) + + if pair[1] == "http": + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata["version"] + ) + 
self.assertEqual("triton", server_metadata["name"]) + for ext in extensions: + self.assertIn(ext, server_metadata["extensions"]) + + self.assertEqual(model_name, model_metadata["name"]) + else: + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata.version + ) + self.assertEqual("triton", server_metadata.name) + for ext in extensions: + self.assertIn(ext, server_metadata.extensions) + + self.assertEqual(model_name, model_metadata.name) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_unknown_model(self): try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status = _get_server_status(pair[0], pair[1], "foo") + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + model_name = "foo" + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + server_metadata = triton_client.get_server_metadata() + if pair[1] == "http": + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata["version"] + ) + self.assertEqual("triton", server_metadata["name"]) + else: + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata.version + ) + self.assertEqual("triton", server_metadata.name) + + model_metadata = triton_client.get_model_metadata(model_name) self.assertTrue(False, "expected unknown model failure") except InferenceServerException as ex: - self.assertEqual("inference:0", ex.server_id()) - self.assertGreater(ex.request_id(), 0) self.assertTrue( - ex.message().startswith("no status available for unknown model")) + ex.message().startswith("Request for unknown model: 'foo' is not found") + ) + + def test_unknown_model_version(self): + try: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + model_name = "graphdef_int32_int8_int8" + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + + model_metadata = triton_client.get_model_metadata( + model_name, model_version="99" + ) + self.assertTrue(False, "expected unknown model version failure") + except InferenceServerException as ex: + self.assertTrue( + ex.message().startswith( + "Request for unknown model: 'graphdef_int32_int8_int8' version 99 is not found" + ) + ) def test_model_latest_infer(self): input_size = 16 - tensor_shape = (input_size,) + tensor_shape = (1, input_size) + platform_name = {"graphdef": "tensorflow_graphdef", "onnx": "onnxruntime_onnx"} # There are 3 versions of *_int32_int32_int32 and all # should be available. - for platform in ('graphdef', 'netdef'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" # Initially there should be no version stats.. 
try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status0, req_id0 = _get_server_status(pair[0], pair[1], model_name) - self.assertTrue(model_name in server_status0.model_status, - "expected status for model " + model_name) - self.assertEqual(len(server_status0.model_status[model_name].version_status), 3, - "expected status for 3 versions for model " + model_name) - for v in (1, 2, 3): - self.assertTrue(v in server_status0.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - self.assertEqual(server_status0.model_status[model_name].version_status[v].ready_state, - server_status.MODEL_READY) + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + model_metadata = triton_client.get_model_metadata(model_name) + # verify all versions are reported when no model version is specified + if pair[1] == "http": + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 3) + for v in (1, 2, 3): + self.assertIn(str(v), model_metadata["versions"]) + else: + self.assertEqual(model_name, model_metadata.name) + self.assertEqual(len(model_metadata.versions), 3) + for v in (1, 2, 3): + self.assertIn(str(v), model_metadata.versions) + + # verify contents of model metadata + if pair[1] == "http": + model_platform = model_metadata["platform"] + model_inputs = model_metadata["inputs"] + model_outputs = model_metadata["outputs"] + else: + model_platform = model_metadata.platform + model_inputs = model_metadata.inputs + model_outputs = model_metadata.outputs + + self.assertEqual(platform_name[platform], model_platform) + self.assertEqual(len(model_inputs), 2) + self.assertEqual(len(model_outputs), 2) + + for model_input in model_inputs: + if pair[1] == "http": + input_dtype = model_input["datatype"] + input_shape = model_input["shape"] + input_name = model_input["name"] + else: + input_dtype = model_input.datatype + input_shape = model_input.shape + input_name = model_input.name + self.assertIn(input_name, ["INPUT0", "INPUT1"]) + self.assertEqual(input_dtype, "INT32") + self.assertEqual(input_shape, [-1, 16]) + + for model_output in model_outputs: + if pair[1] == "http": + output_dtype = model_output["datatype"] + output_shape = model_output["shape"] + output_name = model_output["name"] + else: + output_dtype = model_output.datatype + output_shape = model_output.shape + output_name = model_output.name + self.assertIn(output_name, ["OUTPUT0", "OUTPUT1"]) + self.assertEqual(output_dtype, "INT32") + self.assertEqual(output_shape, [-1, 16]) + except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using latest version (which is 3)... 
- iu.infer_exact(self, platform, tensor_shape, 1, True, - np.int32, np.int32, np.int32, - model_version=None, swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + model_version=None, + swap=True, + ) try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status1, req_id1 = _get_server_status(pair[0], pair[1], model_name) - self.assertTrue(model_name in server_status1.model_status, - "expected status for model " + model_name) - self.assertEqual(len(server_status1.model_status[model_name].version_status), 3, - "expected status for 3 versions for model " + model_name) + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) for v in (1, 2, 3): - self.assertTrue(v in server_status1.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - self.assertEqual(server_status1.model_status[model_name].version_status[v].ready_state, - server_status.MODEL_READY) + self.assertTrue( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) # Only version 3 should have infer stats - for v in (1, 2, 3): - version_status = server_status1.model_status[model_name].version_status[v] - if v == 3: - self.assertEqual(len(version_status.infer_stats), 1, - "expected 1 infer stats for v" + str(v) + " model " + model_name) - self.assertTrue(1 in version_status.infer_stats, - "expected batch 1 status for v" + str(v) + " model " + model_name) - infer_stats = version_status.infer_stats[1] - self.assertTrue(infer_stats.success.count, 1) + infer_stats = triton_client.get_inference_statistics(model_name) + if pair[1] == "http": + stats = infer_stats["model_stats"] + else: + stats = infer_stats.model_stats + self.assertEqual( + len(stats), 3, "expected 3 infer stats for model " + model_name + ) + for s in stats: + if pair[1] == "http": + v = s["version"] + stat = s["inference_stats"] + else: + v = s.version + stat = s.inference_stats + + if v == "3": + if pair[1] == "http": + self.assertTrue(stat["success"]["count"], 3) + else: + self.assertTrue(stat.success.count, 3) else: - self.assertEqual(len(version_status.infer_stats), 0, - "unexpected infer stats for v" + str(v) + " model " + model_name) + if pair[1] == "http": + self.assertEqual( + stat["success"]["count"], + 0, + "unexpected infer success counts for version " + + str(v) + + " of model " + + model_name, + ) + else: + self.assertEqual( + stat.success.count, + 0, + "unexpected infer success counts for version " + + str(v) + + " of model " + + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -162,141 +316,420 @@ def test_model_specific_infer(self): # There are 3 versions of *_float32_float32_float32 but only # versions 1 and 3 should be available. - for platform in ('graphdef', 'netdef', 'plan'): - tensor_shape = (input_size, 1, 1) if platform == 'plan' else (input_size,) + for platform in ("graphdef", "onnx", "plan"): + tensor_shape = (1, input_size) model_name = platform + "_float32_float32_float32" # Initially there should be no version status... 
try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status0, req_id0 = _get_server_status(pair[0], pair[1], model_name) - self.assertTrue(model_name in server_status0.model_status, - "expected status for model " + model_name) - self.assertEqual(len(server_status0.model_status[model_name].version_status), 2, - "expected status for 2 versions for model " + model_name) - for v in (1, 3): - self.assertTrue(v in server_status0.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - self.assertEqual(server_status0.model_status[model_name].version_status[v].ready_state, - server_status.MODEL_READY) + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue( + triton_client.is_model_ready(model_name, model_version="1") + ) + self.assertFalse( + triton_client.is_model_ready(model_name, model_version="2") + ) + self.assertTrue( + triton_client.is_model_ready(model_name, model_version="3") + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using version 1... - iu.infer_exact(self, platform, tensor_shape, 1, True, - np.float32, np.float32, np.float32, - model_version=1, swap=False) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + swap=False, + ) try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status1, req_id1 = _get_server_status(pair[0], pair[1], model_name) - self.assertTrue(model_name in server_status1.model_status, - "expected status for model " + model_name) - self.assertEqual(len(server_status1.model_status[model_name].version_status), 2, - "expected status for 2 versions for model " + model_name) - for v in (1, 3): - self.assertTrue(v in server_status1.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - self.assertEqual(server_status1.model_status[model_name].version_status[v].ready_state, - server_status.MODEL_READY) + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertTrue( + triton_client.is_model_ready(model_name, model_version="1") + ) + self.assertFalse( + triton_client.is_model_ready(model_name, model_version="2") + ) + self.assertTrue( + triton_client.is_model_ready(model_name, model_version="3") + ) # Only version 1 should have infer stats - for v in (1, 3): - version_status = server_status1.model_status[model_name].version_status[v] - if v == 1: - self.assertEqual(len(version_status.infer_stats), 1, - "expected 1 infer stats for v" + str(v) + " model " + model_name) - self.assertTrue(1 in version_status.infer_stats, - "expected batch 1 status for v" + str(v) + " model " + model_name) - infer_stats = version_status.infer_stats[1] - self.assertTrue(infer_stats.success.count, 1) 
- else: - self.assertEqual(len(version_status.infer_stats), 0, - "unexpected infer stats for v" + str(v) + " model " + model_name) + infer_stats = triton_client.get_inference_statistics( + model_name, model_version="1" + ) + if pair[1] == "http": + self.assertEqual( + len(infer_stats["model_stats"]), + 1, + "expected 1 infer stats for version 1" + " of model " + model_name, + ) + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertTrue(stats["success"]["count"], 3) + else: + self.assertEqual( + len(infer_stats.model_stats), + 1, + "expected 1 infer stats for version 1" + " of model " + model_name, + ) + stats = infer_stats.model_stats[0].inference_stats + self.assertTrue(stats.success.count, 3) + infer_stats = triton_client.get_inference_statistics( + model_name, model_version="3" + ) + if pair[1] == "http": + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertEqual( + stats["success"]["count"], + 0, + "unexpected infer stats for version 3" + " of model " + model_name, + ) + else: + stats = infer_stats.model_stats[0].inference_stats + self.assertEqual( + stats.success.count, + 0, + "unexpected infer stats for version 3" + " of model " + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -class ModelStatusTest(unittest.TestCase): - ''' - These tests must be run after the ServerStatusTest. See test.sh +class ModelMetadataTest(tu.TestResultCollector): + """ + These tests must be run after the ServerMetadataTest. See test.sh file for correct test running. - ''' + """ + def test_model_versions_deleted(self): # Originally There were 3 versions of *_int32_int32_int32 and # version 3 was executed once. Version 2 and 3 models were - # deleted from the model store so now only expect version 1 to - # be ready and version 3 to show stats but not be ready. - for platform in ('graphdef', 'netdef'): + # deleted from the model repository so now only expect version 1 to + # be ready and show stats. 
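+        # Version 1 should now be the only ready version; versions 2 and 3
+        # should report not-ready, and version 1's success count should still
+        # be zero because only version 3 was ever executed.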
+ for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status1, req_id1 = _get_server_status(pair[0], pair[1], model_name) - self.assertTrue(model_name in server_status1.model_status, - "expected status for model " + model_name) - self.assertEqual(len(server_status1.model_status[model_name].version_status), 3, - "expected status for 3 versions for model " + model_name) + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + model_metadata = triton_client.get_model_metadata(model_name) + if pair[1] == "http": + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 1) + self.assertEqual("1", model_metadata["versions"][0]) + else: + self.assertEqual(model_name, model_metadata.name) + self.assertEqual(len(model_metadata.versions), 1) + self.assertEqual("1", model_metadata.versions[0]) # Only version 3 should have infer stats, only 1 is ready for v in (1, 2, 3): - self.assertTrue(v in server_status1.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - version_status = server_status1.model_status[model_name].version_status[v] if v == 1: - self.assertEqual(version_status.ready_state, server_status.MODEL_READY) - self.assertEqual(len(version_status.infer_stats), 0, - "unexpected infer stats for v" + str(v) + " model " + model_name) - else: - self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) - if v == 2: - self.assertEqual(len(version_status.infer_stats), 0, - "unexpected infer stats for v" + str(v) + " model " + model_name) + self.assertTrue( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) + infer_stats = triton_client.get_inference_statistics( + model_name, model_version=str(v) + ) + if pair[1] == "http": + self.assertEqual( + len(infer_stats["model_stats"]), + 1, + "expected 1 infer stats for version " + + str(v) + + " of model " + + model_name, + ) + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertEqual(stats["success"]["count"], 0) else: - self.assertEqual(len(version_status.infer_stats), 1, - "expected 1 infer stats for v" + str(v) + " model " + model_name) - self.assertTrue(1 in version_status.infer_stats, - "expected batch 1 status for v" + str(v) + " model " + model_name) - infer_stats = version_status.infer_stats[1] - self.assertTrue(infer_stats.success.count, 1) + self.assertEqual( + len(infer_stats.model_stats), + 1, + "expected 1 infer stats for version " + + str(v) + + " of model " + + model_name, + ) + stats = infer_stats.model_stats[0].inference_stats + self.assertEqual(stats.success.count, 0) + + else: + self.assertFalse( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_model_versions_added(self): # Originally There was version 1 of *_float16_float32_float32. - # Version 7 was added so now expect just version 7 to be ready. 
- for platform in ('graphdef',): + # Version 7 was added so now expect just version 7 to be ready + # and provide infer stats. + for platform in ("graphdef",): model_name = platform + "_float16_float32_float32" try: - for pair in [("localhost:8000", ProtocolType.HTTP), ("localhost:8001", ProtocolType.GRPC)]: - server_status1, req_id1 = _get_server_status(pair[0], pair[1], model_name) - self.assertTrue(model_name in server_status1.model_status, - "expected status for model " + model_name) - self.assertEqual(len(server_status1.model_status[model_name].version_status), 2, - "expected status for 2 versions for model " + model_name) - - for v in (1,): - self.assertTrue(v in server_status1.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - version_status = server_status1.model_status[model_name].version_status[v] - self.assertEqual(version_status.ready_state, server_status.MODEL_UNAVAILABLE) - self.assertEqual(len(version_status.infer_stats), 0, - "unexpected infer stats for v" + str(v) + " model " + model_name) - - for v in (7,): - self.assertTrue(v in server_status1.model_status[model_name].version_status, - "expected version " + str(v) + " status for model " + model_name) - version_status = server_status1.model_status[model_name].version_status[v] - self.assertEqual(version_status.ready_state, server_status.MODEL_READY) - self.assertEqual(len(version_status.infer_stats), 0, - "unexpected infer stats for v" + str(v) + " model " + model_name) + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + model_metadata = triton_client.get_model_metadata(model_name) + if pair[1] == "http": + self.assertEqual( + model_name, + model_metadata["name"], + "expected status for model " + model_name, + ) + self.assertEqual( + len(model_metadata["versions"]), + 1, + "expected status for 1 versions for model " + model_name, + ) + self.assertEqual("7", model_metadata["versions"][0]) + else: + self.assertEqual( + model_name, + model_metadata.name, + "expected status for model " + model_name, + ) + self.assertEqual( + len(model_metadata.versions), + 1, + "expected status for 1 versions for model " + model_name, + ) + self.assertEqual("7", model_metadata.versions[0]) + + # Only version 7 should be ready and show infer stat. 
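+                    # Only the newly added version 7 is expected to be served;
+                    # version 1 should report not-ready and requesting its
+                    # statistics should raise an error, which is handled in
+                    # the else branch below.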
+ for v in (1, 7): + if v == 7: + self.assertTrue( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) + infer_stats = triton_client.get_inference_statistics( + model_name, model_version=str(v) + ) + if pair[1] == "http": + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertEqual( + stats["success"]["count"], + 0, + "unexpected infer stats for version " + + str(v) + + " of model " + + model_name, + ) + else: + stats = infer_stats.model_stats[0].inference_stats + self.assertEqual( + stats.success.count, + 0, + "unexpected infer stats for version " + + str(v) + + " of model " + + model_name, + ) + + else: + self.assertFalse( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) + try: + infer_stats = triton_client.get_inference_statistics( + model_name, model_version=str(v) + ) + self.assertTrue( + False, + "unexpected infer stats for the model that is not ready", + ) + except InferenceServerException as ex: + self.assertIn( + "requested model version is not available for model", + str(ex), + ) + + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_infer_stats_no_model_version(self): + # Originally There were 3 versions of *_int32_int32_int32 and + # version 3 was executed once. Version 2 and 3 models were + # deleted from the model repository so now only expect version 1 to + # be ready and show infer stats. + for platform in ("graphdef", "onnx"): + model_name = platform + "_int32_int32_int32" + + try: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + model_metadata = triton_client.get_model_metadata(model_name) + if pair[1] == "http": + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 1) + self.assertEqual("1", model_metadata["versions"][0]) + else: + self.assertEqual(model_name, model_metadata.name) + self.assertEqual(len(model_metadata.versions), 1) + self.assertEqual("1", model_metadata.versions[0]) + + # Only version 3 should have infer stats, only 1 is ready + for v in (1, 2, 3): + if v == 1: + self.assertTrue( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) + else: + self.assertFalse( + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) + + infer_stats = triton_client.get_inference_statistics(model_name) + if pair[1] == "http": + stats = infer_stats["model_stats"] + else: + stats = infer_stats.model_stats + self.assertEqual( + len(stats), 1, "expected 1 infer stats for model " + model_name + ) + + if pair[1] == "http": + version = stats[0]["version"] + stat = stats[0]["inference_stats"] + else: + version = stats[0].version + stat = stats[0].inference_stats + + if version != "1": + self.assertTrue( + False, "expected version 1 for infer stat, got " + version + ) + else: + if pair[1] == "http": + self.assertEqual( + stat["success"]["count"], + 0, + "unexpected infer stats for version " + + str(version) + + " of model " + + model_name, + ) + else: + self.assertEqual( + stat.success.count, + 0, + "unexpected infer stats for version " + + str(version) + + " of model " + + model_name, + ) except InferenceServerException as ex: 
self.assertTrue(False, "unexpected error {}".format(ex)) + def test_infer_stats_no_model(self): + # Test get_inference_statistics when no model/model_version is passed. + try: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: + if pair[1] == "http": + triton_client = httpclient.InferenceServerClient( + url=pair[0], verbose=True + ) + else: + triton_client = grpcclient.InferenceServerClient( + url=pair[0], verbose=True + ) + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + + # Returns infer stats for ALL models + ready versions + infer_stats = triton_client.get_inference_statistics() + if pair[1] == "http": + stats = infer_stats["model_stats"] + else: + stats = infer_stats.model_stats + self.assertEqual( + len(stats), + 219, + "expected 219 infer stats for all ready versions of all model", + ) + + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_server_status/test.sh b/qa/L0_server_status/test.sh index 536d2a9043..1e27339a38 100755 --- a/qa/L0_server_status/test.sh +++ b/qa/L0_server_status/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,13 +25,31 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' CLIENT_LOG="./client.log" SERVER_STATUS_TEST=server_status_test.py +EXPECTED_NUM_TESTS_MMDT="4" +EXPECTED_NUM_TESTS_SMDT="5" -DATADIR=/data/inferenceserver +DATADIR=/data/inferenceserver/${REPO_VERSION} -SERVER=/opt/tensorrtserver/bin/trtserver -SERVER_ARGS="--repository-poll-secs=1 --model-store=`pwd`/models" +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--repository-poll-secs=1 --model-control-mode=poll --model-repository=`pwd`/models" SERVER_LOG="./inference_server.log" source ../common/util.sh @@ -47,35 +65,47 @@ fi RET=0 +set +e + rm -f $CLIENT_LOG -python $SERVER_STATUS_TEST ServerStatusTest >>$CLIENT_LOG 2>&1 +python $SERVER_STATUS_TEST ServerMetadataTest >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS_SMDT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi fi +set -e + rm -fr models/graphdef_int32_int32_int32/2 models/graphdef_int32_int32_int32/3 -rm -fr models/netdef_int32_int32_int32/2 models/netdef_int32_int32_int32/3 +rm -fr models/onnx_int32_int32_int32/2 models/onnx_int32_int32_int32/3 cp -r models/graphdef_float16_float32_float32/1 models/graphdef_float16_float32_float32/7 sleep 3 -python $SERVER_STATUS_TEST ModelStatusTest >>$CLIENT_LOG 2>&1 +set +e + +python $SERVER_STATUS_TEST ModelMetadataTest >>$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS_MMDT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi fi -# python unittest seems to swallow ImportError and still return 0 exit -# code. So need to explicitly check CLIENT_LOG to make sure we see -# some running tests -grep -c "HTTP/1.1 200 OK" $CLIENT_LOG -if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Failed To Run\n***" - RET=1 -fi +set -e kill $SERVER_PID wait $SERVER_PID diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py new file mode 100755 index 0000000000..871fca9b2a --- /dev/null +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -0,0 +1,615 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import time +import unittest +from functools import partial + +import infer_util as iu +import numpy as np +import psutil +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +import tritonclient.utils.shared_memory as shm +from tritonclient import utils + + +class SystemSharedMemoryTestBase(tu.TestResultCollector): + DEFAULT_SHM_BYTE_SIZE = 64 + + def setUp(self): + self._setup_client() + + def _setup_client(self): + self.protocol = os.environ.get("CLIENT_TYPE", "http") + if self.protocol == "http": + self.url = "localhost:8000" + self.triton_client = httpclient.InferenceServerClient( + self.url, verbose=True + ) + else: + self.url = "localhost:8001" + self.triton_client = grpcclient.InferenceServerClient( + self.url, verbose=True + ) + + def _configure_server( + self, + create_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_byte_size=DEFAULT_SHM_BYTE_SIZE, + register_offset=0, + ): + """Creates and registers shared memory regions for testing. 
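+
+        For example (illustrative): the default arguments create four
+        64-byte regions ("input0_data", "input1_data", "output0_data",
+        "output1_data"), fill the two input regions with test data, and
+        register all four regions with the server.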
+ + Parameters + ---------- + create_byte_size: int + Size of each system shared memory region to create. + NOTE: This should be sufficiently large to hold the inputs/outputs + stored in shared memory. + + register_byte_size: int + Size of each system shared memory region to register with server. + NOTE: The (offset + register_byte_size) should be less than or equal + to the create_byte_size. Otherwise an exception will be raised for + an invalid set of registration args. + + register_offset: int + Offset into the shared memory object to start the registered region. + + """ + shm_ip0_handle = shm.create_shared_memory_region( + "input0_data", "/input0_data", create_byte_size + ) + shm_ip1_handle = shm.create_shared_memory_region( + "input1_data", "/input1_data", create_byte_size + ) + shm_op0_handle = shm.create_shared_memory_region( + "output0_data", "/output0_data", create_byte_size + ) + shm_op1_handle = shm.create_shared_memory_region( + "output1_data", "/output1_data", create_byte_size + ) + # Implicit assumption that input and output byte_sizes are 64 bytes for now + input0_data = np.arange(start=0, stop=16, dtype=np.int32) + input1_data = np.ones(shape=16, dtype=np.int32) + shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) + shm.set_shared_memory_region(shm_ip1_handle, [input1_data]) + self.triton_client.register_system_shared_memory( + "input0_data", "/input0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "input1_data", "/input1_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output0_data", "/output0_data", register_byte_size, offset=register_offset + ) + self.triton_client.register_system_shared_memory( + "output1_data", "/output1_data", register_byte_size, offset=register_offset + ) + return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] + + def _cleanup_server(self, shm_handles): + for shm_handle in shm_handles: + shm.destroy_shared_memory_region(shm_handle) + + +class SharedMemoryTest(SystemSharedMemoryTestBase): + def test_invalid_create_shm(self): + # Raises error since tried to create invalid system shared memory region + try: + shm_op0_handle = shm.create_shared_memory_region( + "dummy_data", "/dummy_data", -1 + ) + shm.destroy_shared_memory_region(shm_op0_handle) + except Exception as ex: + self.assertTrue(str(ex) == "unable to initialize the size") + + def test_valid_create_set_register(self): + # Create a valid system shared memory region, fill data in it and register + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + shm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + self.triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) + shm_status = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(shm_status) == 1) + else: + self.assertTrue(len(shm_status.regions) == 1) + shm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_before_register(self): + # Create a valid system shared memory region and unregister before register + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + self.triton_client.unregister_system_shared_memory("dummy_data") + shm_status = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(shm_status) == 0) + else: + self.assertTrue(len(shm_status.regions) == 0) + 
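+        # The region was only created locally and never registered with the
+        # server, so only the local handle needs to be cleaned up here.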
shm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_register(self): + # Create a valid system shared memory region and unregister after register + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + self.triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) + self.triton_client.unregister_system_shared_memory("dummy_data") + shm_status = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(shm_status) == 0) + else: + self.assertTrue(len(shm_status.regions) == 0) + shm.destroy_shared_memory_region(shm_op0_handle) + + def test_reregister_after_register(self): + # Create a valid system shared memory region and unregister after register + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + self.triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) + try: + self.triton_client.register_system_shared_memory( + "dummy_data", "/dummy_data", 8 + ) + except Exception as ex: + self.assertIn( + "shared memory region 'dummy_data' already in manager", str(ex) + ) + shm_status = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(shm_status) == 1) + else: + self.assertTrue(len(shm_status.regions) == 1) + shm.destroy_shared_memory_region(shm_op0_handle) + + def test_unregister_after_inference(self): + # Unregister after inference + error_msg = [] + shm_handles = self._configure_server() + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + protocol=self.protocol, + use_system_shared_memory=True, + ) + if len(error_msg) > 0: + raise Exception(str(error_msg)) + self.triton_client.unregister_system_shared_memory("output0_data") + shm_status = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(shm_status) == 3) + else: + self.assertTrue(len(shm_status.regions) == 3) + self._cleanup_server(shm_handles) + + def test_register_after_inference(self): + # Register after inference + error_msg = [] + shm_handles = self._configure_server() + + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + protocol=self.protocol, + use_system_shared_memory=True, + ) + + if len(error_msg) > 0: + raise Exception(str(error_msg)) + shm_ip2_handle = shm.create_shared_memory_region( + "input2_data", "/input2_data", self.DEFAULT_SHM_BYTE_SIZE + ) + self.triton_client.register_system_shared_memory( + "input2_data", "/input2_data", self.DEFAULT_SHM_BYTE_SIZE + ) + shm_status = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(shm_status) == 5) + else: + self.assertTrue(len(shm_status.regions) == 5) + shm_handles.append(shm_ip2_handle) + self._cleanup_server(shm_handles) + + def test_too_big_shm(self): + # Shared memory input region larger than needed - Throws error + error_msg = [] + shm_handles = self._configure_server() + shm_ip2_handle = shm.create_shared_memory_region( + "input2_data", "/input2_data", 128 + ) + self.triton_client.register_system_shared_memory( + "input2_data", "/input2_data", 128 + ) + + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_ip2_handle, + shm_handles[2], + shm_handles[3], + error_msg, + big_shm_name="input2_data", + big_shm_size=128, + protocol=self.protocol, + 
use_system_shared_memory=True, + ) + if len(error_msg) > 0: + self.assertIn( + "input byte size mismatch for input 'INPUT1' for model 'simple'. Expected 64, got 128", + error_msg[-1], + ) + shm_handles.append(shm_ip2_handle) + self._cleanup_server(shm_handles) + + def test_mixed_raw_shm(self): + # Mix of shared memory and RAW inputs + error_msg = [] + shm_handles = self._configure_server() + input1_data = np.ones(shape=16, dtype=np.int32) + + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + [input1_data], + shm_handles[2], + shm_handles[3], + error_msg, + protocol=self.protocol, + use_system_shared_memory=True, + ) + if len(error_msg) > 0: + raise Exception(error_msg[-1]) + self._cleanup_server(shm_handles) + + def test_unregisterall(self): + # Unregister all shared memory blocks + shm_handles = self._configure_server() + status_before = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(status_before) == 4) + else: + self.assertTrue(len(status_before.regions) == 4) + self.triton_client.unregister_system_shared_memory() + status_after = self.triton_client.get_system_shared_memory_status() + if self.protocol == "http": + self.assertTrue(len(status_after) == 0) + else: + self.assertTrue(len(status_after.regions) == 0) + self._cleanup_server(shm_handles) + + def test_infer_offset_out_of_bound(self): + # Shared memory offset outside output region - Throws error + error_msg = [] + shm_handles = self._configure_server() + if self.protocol == "http": + # -32 when placed in an int64 signed type, to get a negative offset + # by overflowing + offset = 2**64 - 32 + else: + # gRPC will throw an error if > 2**63 - 1, so instead test for + # exceeding shm region size by 1 byte, given its size is 64 bytes + offset = 64 + + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + shm_output_offset=offset, + protocol=self.protocol, + use_system_shared_memory=True, + ) + + self.assertEqual(len(error_msg), 1) + self.assertIn("Invalid offset for shared memory region", error_msg[0]) + self._cleanup_server(shm_handles) + + def test_infer_byte_size_out_of_bound(self): + # Shared memory byte_size outside output region - Throws error + error_msg = [] + shm_handles = self._configure_server() + offset = 60 + byte_size = self.DEFAULT_SHM_BYTE_SIZE + + iu.shm_basic_infer( + self, + self.triton_client, + shm_handles[0], + shm_handles[1], + shm_handles[2], + shm_handles[3], + error_msg, + shm_output_offset=offset, + shm_output_byte_size=byte_size, + protocol=self.protocol, + use_system_shared_memory=True, + ) + self.assertEqual(len(error_msg), 1) + self.assertIn( + "Invalid offset + byte size for shared memory region", error_msg[0] + ) + self._cleanup_server(shm_handles) + + def test_register_out_of_bound(self): + create_byte_size = self.DEFAULT_SHM_BYTE_SIZE + + # Verify various edge cases of registered region size (offset+byte_size) + # don't go out of bounds of the actual created shm file object's size. 
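+        # All four registrations below are expected to fail with "invalid
+        # args": a byte size one past the end of the region, the full byte
+        # size at a non-zero offset, a one-byte registration starting at the
+        # end of the region, and a zero-byte registration starting past it.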
+ with self.assertRaisesRegex( + utils.InferenceServerException, + "failed to register shared memory region.*invalid args", + ): + self._configure_server( + create_byte_size=create_byte_size, + register_byte_size=create_byte_size + 1, + register_offset=0, + ) + + with self.assertRaisesRegex( + utils.InferenceServerException, + "failed to register shared memory region.*invalid args", + ): + self._configure_server( + create_byte_size=create_byte_size, + register_byte_size=create_byte_size, + register_offset=1, + ) + + with self.assertRaisesRegex( + utils.InferenceServerException, + "failed to register shared memory region.*invalid args", + ): + self._configure_server( + create_byte_size=create_byte_size, + register_byte_size=1, + register_offset=create_byte_size, + ) + + with self.assertRaisesRegex( + utils.InferenceServerException, + "failed to register shared memory region.*invalid args", + ): + self._configure_server( + create_byte_size=create_byte_size, + register_byte_size=0, + register_offset=create_byte_size + 1, + ) + + def test_python_client_leak(self): + process = psutil.Process() + initial_mem_usage = process.memory_info().rss / 1024**2 + threshold = initial_mem_usage * 1.02 # 2% tolerance threshold + + byte_size = 4 + i = 0 + while i < 100000: + if i % 5000 == 0: + print( + f"[iter: {i:<8}] Memory Usage:", + process.memory_info().rss / 1024**2, + "MiB", + ) + + shm_handle = shm.create_shared_memory_region( + "shmtest", "/shmtest", byte_size + ) + shm.destroy_shared_memory_region(shm_handle) + i += 1 + final_mem_usage = process.memory_info().rss / 1024**2 + self.assertTrue( + (initial_mem_usage <= final_mem_usage <= threshold), + "client memory usage is increasing", + ) + + +class TestSharedMemoryUnregister(SystemSharedMemoryTestBase): + def _test_unregister_shm_fail(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory() + self.assertIn( + "Failed to unregister the following system shared memory regions: input0_data ,input1_data ,output0_data ,output1_data", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input0_data") + self.assertIn( + "Cannot unregister shared memory region 'input0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("input1_data") + self.assertIn( + "Cannot unregister shared memory region 'input1_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output0_data") + self.assertIn( + "Cannot unregister shared memory region 'output0_data', it is currently in use.", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.unregister_system_shared_memory("output1_data") + self.assertIn( + "Cannot unregister shared memory region 'output1_data', it is currently in use.", + str(ex.exception), + ) + + def _test_shm_not_found(self): + second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input0_data") + self.assertIn( + "Unable to find system shared memory region: 'input0_data'", + str(ex.exception), + ) + + 
with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("input1_data") + self.assertIn( + "Unable to find system shared memory region: 'input1_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output0_data") + self.assertIn( + "Unable to find system shared memory region: 'output0_data'", + str(ex.exception), + ) + + with self.assertRaises(utils.InferenceServerException) as ex: + second_client.get_system_shared_memory_status("output1_data") + self.assertIn( + "Unable to find system shared memory region: 'output1_data'", + str(ex.exception), + ) + + def test_unregister_shm_during_inference_http(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + httpclient.InferInput("INPUT0", [1, 16], "INT32"), + httpclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + httpclient.InferRequestedOutput("OUTPUT0", binary_data=True), + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + async_request = self.triton_client.async_infer( + model_name="simple", inputs=inputs, outputs=outputs + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Blocking call + async_request.get_result() + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + def test_unregister_shm_during_inference_grpc(self): + try: + self.triton_client.unregister_system_shared_memory() + shm_handles = self._configure_server() + + inputs = [ + grpcclient.InferInput("INPUT0", [1, 16], "INT32"), + grpcclient.InferInput("INPUT1", [1, 16], "INT32"), + ] + outputs = [ + grpcclient.InferRequestedOutput("OUTPUT0"), + grpcclient.InferRequestedOutput("OUTPUT1"), + ] + + inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE) + inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE) + outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE) + + def callback(user_data, result, error): + if error: + user_data.append(error) + else: + user_data.append(result) + + user_data = [] + + self.triton_client.async_infer( + model_name="simple", + inputs=inputs, + outputs=outputs, + callback=partial(callback, user_data), + ) + + # Ensure inference started + time.sleep(2) + + # Try unregister shm regions during inference + self._test_unregister_shm_fail() + + # Wait until the results are available in user_data + time_out = 20 + while (len(user_data) == 0) and time_out > 0: + time_out = time_out - 1 + time.sleep(1) + time.sleep(2) + + # Try unregister shm regions after inference + self.triton_client.unregister_system_shared_memory() + self._test_shm_not_found() + + finally: + self._cleanup_server(shm_handles) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh new file mode 100755 index 0000000000..e711de9cff --- /dev/null +++ 
b/qa/L0_shared_memory/test.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CLIENT_LOG="./client.log" +SHM_TEST=shared_memory_test.py +TEST_RESULT_FILE='test_results.txt' + +# Configure to support test on jetson as well +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends +SERVER_ARGS_EXTRA="--backend-directory=${BACKEND_DIR}" +source ../common/util.sh +pip3 install psutil + +RET=0 +rm -fr *.log + +for i in \ + test_invalid_create_shm \ + test_valid_create_set_register \ + test_unregister_before_register \ + test_unregister_after_register \ + test_reregister_after_register \ + test_unregister_after_inference \ + test_register_after_inference \ + test_too_big_shm \ + test_mixed_raw_shm \ + test_unregisterall \ + test_infer_offset_out_of_bound \ + test_infer_byte_size_out_of_bound \ + test_register_out_of_bound \ + test_python_client_leak; do + for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./$i.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + TMP_CLIENT_LOG="./tmp_client.log" + echo "Test: $i, client type: $client_type" >>$TMP_CLIENT_LOG + + set +e + python3 $SHM_TEST SharedMemoryTest.$i >>$TMP_CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $TMP_CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + cat $TMP_CLIENT_LOG >>$CLIENT_LOG + rm $TMP_CLIENT_LOG + kill $SERVER_PID + wait $SERVER_PID + if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done +done + +mkdir -p python_models/simple/1/ +cp ../python_models/execute_delayed_model/model.py ./python_models/simple/1/ +cp ../python_models/execute_delayed_model/config.pbtxt ./python_models/simple/ + +for client_type in http grpc; do + SERVER_ARGS="--model-repository=`pwd`/python_models --log-verbose=1 ${SERVER_ARGS_EXTRA}" + SERVER_LOG="./unregister_shm.$client_type.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + export CLIENT_TYPE=$client_type + CLIENT_LOG="./unregister_shm.$client_type.client.log" + set +e + python3 $SHM_TEST TestSharedMemoryUnregister.test_unregister_shm_during_inference_$client_type >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $TEST_RESULT_FILE + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + kill $SERVER_PID + wait $SERVER_PID + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Server shut down non-gracefully\n***" + RET=1 + fi + set -e + done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_simple_ensemble/ensemble_test.py b/qa/L0_simple_ensemble/ensemble_test.py new file mode 100755 index 0000000000..db516651df --- /dev/null +++ b/qa/L0_simple_ensemble/ensemble_test.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
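+
+# Tests for the add/sub ensembles built on the two-version "simple" model:
+# verifies per-version inference counts for full and single-output requests,
+# and that sequence start/end flags are propagated through the ensemble when
+# streaming over gRPC.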
+ +import random +import sys +import time +from functools import partial + +import numpy as np +import tritonclient.grpc as grpcclient + +sys.path.append("../common") +sys.path.append("../clients") + +import logging +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu +import tritonhttpclient + + +# Utility function to Generate N requests with appropriate sequence flags +class RequestGenerator: + def __init__(self, init_value, num_requests) -> None: + self.count = 0 + self.init_value = init_value + self.num_requests = num_requests + + def __enter__(self): + return self + + def __iter__(self): + return self + + def __next__(self) -> bytes: + value = self.init_value + self.count + if self.count == self.num_requests: + raise StopIteration + start = True if self.count == 0 else False + end = True if self.count == self.num_requests - 1 else False + self.count = self.count + 1 + return start, end, self.count - 1, value + + +class EnsembleTest(tu.TestResultCollector): + def _get_infer_count_per_version(self, model_name): + triton_client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + stats = triton_client.get_inference_statistics(model_name) + self.assertEqual(len(stats["model_stats"]), 2) + infer_count = [0, 0] + for model_stat in stats["model_stats"]: + self.assertEqual( + model_stat["name"], model_name, "expected stats for model " + model_name + ) + model_version = model_stat["version"] + if model_version == "1": + infer_count[0] = model_stat["inference_stats"]["success"]["count"] + elif model_version == "2": + infer_count[1] = model_stat["inference_stats"]["success"]["count"] + else: + self.assertTrue( + False, + "unexpected version {} for model {}".format( + model_version, model_name + ), + ) + return infer_count + + def test_ensemble_add_sub(self): + for bs in (1, 8): + iu.infer_exact( + self, "ensemble_add_sub", (bs, 16), bs, np.int32, np.int32, np.int32 + ) + + infer_count = self._get_infer_count_per_version("simple") + # The two 'simple' versions should have the same infer count + if infer_count[0] != infer_count[1]: + self.assertTrue( + False, "unexpeced different infer count for different 'simple' versions" + ) + + def test_ensemble_add_sub_one_output(self): + for bs in (1, 8): + iu.infer_exact( + self, + "ensemble_add_sub", + (bs, 16), + bs, + np.int32, + np.int32, + np.int32, + outputs=("OUTPUT0",), + ) + + infer_count = self._get_infer_count_per_version("simple") + # Only 'simple' version 2 should have non-zero infer count + # as it is in charge of producing OUTPUT0 + if infer_count[0] != 0: + self.assertTrue( + False, "unexpeced non-zero infer count for 'simple' version 1" + ) + elif infer_count[1] == 0: + self.assertTrue(False, "unexpeced zero infer count for 'simple' version 2") + + def test_ensemble_sequence_flags(self): + request_generator = RequestGenerator(0, 3) + # 3 request made expect the START of 1st req to be true and + # END of last request to be true + expected_flags = [[True, False], [False, False], [False, True]] + response_flags = [] + + def callback(start_time, result, error): + response = result.get_response() + arr = [] + arr.append(response.parameters["sequence_start"].bool_param) + arr.append(response.parameters["sequence_end"].bool_param) + response_flags.append(arr) + + start_time = time.time() + triton_client = grpcclient.InferenceServerClient("localhost:8001") + triton_client.start_stream(callback=partial(callback, start_time)) + correlation_id = random.randint(1, 2**31 - 1) + # create 
input tensors
+        input0_data = np.random.randint(0, 100, size=(1, 16), dtype=np.int32)
+        input1_data = np.random.randint(0, 100, size=(1, 16), dtype=np.int32)
+
+        inputs = [
+            grpcclient.InferInput("INPUT0", input0_data.shape, "INT32"),
+            grpcclient.InferInput("INPUT1", input1_data.shape, "INT32"),
+        ]
+
+        inputs[0].set_data_from_numpy(input0_data)
+        inputs[1].set_data_from_numpy(input1_data)
+
+        # create output tensors
+        outputs = [grpcclient.InferRequestedOutput("OUTPUT0")]
+        for sequence_start, sequence_end, count, input_value in request_generator:
+            triton_client.async_stream_infer(
+                model_name="ensemble_add_sub_int32_int32_int32",
+                inputs=inputs,
+                outputs=outputs,
+                request_id=f"{correlation_id}_{count}",
+                sequence_id=correlation_id,
+                sequence_start=sequence_start,
+                sequence_end=sequence_end,
+            )
+        time.sleep(2)
+        if expected_flags != response_flags:
+            self.assertTrue(False, "unexpected sequence flags mismatch error")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(stream=sys.stderr)
+    unittest.main()
diff --git a/qa/L0_simple_ensemble/models/ensemble_add_sub_int32_int32_int32/config.pbtxt b/qa/L0_simple_ensemble/models/ensemble_add_sub_int32_int32_int32/config.pbtxt
new file mode 100644
index 0000000000..1b04885428
--- /dev/null
+++ b/qa/L0_simple_ensemble/models/ensemble_add_sub_int32_int32_int32/config.pbtxt
@@ -0,0 +1,153 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
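+
+# Ensemble of six steps over the "simple" add/sub model: each step maps the
+# ensemble INPUT0/INPUT1 and the intermediate tensors ("double_input0",
+# "double_input1", "input0_val", "input1_val") through either the latest
+# version (model_version: -1) or version 1 of "simple", producing the
+# ensemble OUTPUT0 and OUTPUT1.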
+ +name: "ensemble_add_sub_int32_int32_int32" +platform: "ensemble" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "simple" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT0" + } + output_map { + key: "OUTPUT0" + value: "double_input0" + } + }, + { + model_name: "simple" + model_version: 1 + input_map { + key: "INPUT0" + value: "INPUT1" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "double_input1" + } + }, + { + model_name: "simple" + model_version: -1 + input_map { + key: "INPUT0" + value: "double_input0" + } + input_map { + key: "INPUT1" + value: "INPUT0" + } + output_map { + key: "OUTPUT1" + value: "input0_val" + } + }, + { + model_name: "simple" + model_version: 1 + input_map { + key: "INPUT0" + value: "double_input1" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT1" + value: "input1_val" + } + }, + { + model_name: "simple" + model_version: -1 + input_map { + key: "INPUT0" + value: "input0_val" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + }, + { + model_name: "simple" + model_version: 1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "input1_val" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/L0_simple_ensemble/models/simple/config.pbtxt b/qa/L0_simple_ensemble/models/simple/config.pbtxt new file mode 100644 index 0000000000..7e7a178fc1 --- /dev/null +++ b/qa/L0_simple_ensemble/models/simple/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "simple" +platform: "tensorflow_graphdef" +max_batch_size: 8 +version_policy: { all {} } +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +instance_group [ + { + kind: KIND_CPU + } +] diff --git a/qa/L0_simple_ensemble/test.sh b/qa/L0_simple_ensemble/test.sh new file mode 100755 index 0000000000..0a3c27a2f8 --- /dev/null +++ b/qa/L0_simple_ensemble/test.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +SIMPLE_TEST_PY=./ensemble_test.py + +CLIENT_LOG="./client.log" + +TEST_RESULT_FILE='test_results.txt' +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +# ensure ensemble model has version sub-directory +mkdir -p `pwd`/models/ensemble_add_sub_int32_int32_int32/1 + +rm -f $CLIENT_LOG $SERVER_LOG + +# Run ensemble model with all outputs requested +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +set +e +python $SIMPLE_TEST_PY EnsembleTest.test_ensemble_add_sub >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Run ensemble model with sequence flags and verify response sequence +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $SIMPLE_TEST_PY EnsembleTest.test_ensemble_sequence_flags >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Run ensemble model with only one output requested +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $SIMPLE_TEST_PY EnsembleTest.test_ensemble_add_sub_one_output >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_simple_example/test.sh b/qa/L0_simple_example/test.sh index 4be9be7e7b..d2d4f4b505 100755 --- a/qa/L0_simple_example/test.sh +++ b/qa/L0_simple_example/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,17 +25,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -SIMPLE_CLIENT=../clients/simple_client -SIMPLE_CLIENT_PY=../clients/simple_client.py +export CUDA_VISIBLE_DEVICES=0 -CLIENT_LOG="./client.log" +SIMPLE_CLIENT=../clients/simple_http_infer_client +SIMPLE_CLIENT_PY=../clients/simple_http_infer_client.py -SERVER=/opt/tensorrtserver/bin/trtserver -SERVER_ARGS=--model-store=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" SERVER_LOG="./inference_server.log" source ../common/util.sh -rm -f $CLIENT_LOG $SERVER_LOG +rm -f *.log run_server if [ "$SERVER_PID" == "0" ]; then @@ -46,23 +46,87 @@ fi RET=0 -$SIMPLE_CLIENT -v >>$CLIENT_LOG 2>&1 +set +e + +# Run with default host header... +$SIMPLE_CLIENT -v >>client_c++.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +if [ `grep -c "localhost:8000" client_c++.log` != "2" ]; then + echo -e "\n***\n*** Failed. Expected 2 Host:localhost:8000 headers for C++ client\n***" + RET=1 +fi + +python $SIMPLE_CLIENT_PY -v >>client_py.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +if [ `grep -c "HTTPSocketPoolResponse status=200" client_py.log` != "3" ]; then + echo -e "\n***\n*** Failed. Expected 3 Host:HTTPSocketPoolResponse status=200 headers for Python client\n***" + RET=1 +fi + +# Run with custom host header... +$SIMPLE_CLIENT -v -H"Host:my_host_" >>client_c++_host.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +if [ `grep -c my_host_ client_c++_host.log` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 Host:my_host_ headers for C++ client\n***" + RET=1 +fi + +python $SIMPLE_CLIENT_PY -v -H"Host:my_host_" >>client_py_host.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +if [ `grep -c my_host_ client_py_host.log` != "3" ]; then + echo -e "\n***\n*** Failed. Expected 3 Host:my_host_ headers for Python client\n***" + RET=1 +fi + +# Run with multiple headers... +$SIMPLE_CLIENT -v -H"abc:xyz" -H"123:456" >>client_c++_multi.log 2>&1 if [ $? -ne 0 ]; then RET=1 fi -python $SIMPLE_CLIENT_PY -v >>$CLIENT_LOG 2>&1 +if [ `grep -c "abc: xyz" client_c++_multi.log` != "1" ]; then + echo -e "\n***\n*** Failed. 
Expected 1 abc:xyz headers for C++ client\n***" + RET=1 +fi +if [ `grep -c "123: 456" client_c++_multi.log` != "1" ]; then + echo -e "\n***\n*** Failed. Expected 1 123:456 headers for C++ client\n***" + RET=1 +fi + +python $SIMPLE_CLIENT_PY -v -H"abc:xyz" -H"123:456" >>client_py_multi.log 2>&1 if [ $? -ne 0 ]; then RET=1 fi +if [ `grep -c "'abc': 'xyz'" client_py_multi.log` != "3" ]; then + echo -e "\n***\n*** Failed. Expected 3 abc:xyz headers for Python client\n***" + RET=1 +fi +if [ `grep -c "'123': '456'" client_py_multi.log` != "3" ]; then + echo -e "\n***\n*** Failed. Expected 3 123:456 headers for Python client\n***" + RET=1 +fi + +set -e + kill $SERVER_PID wait $SERVER_PID if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" + echo -e "\n***\n*** Test Passed\n***" else - cat $CLIENT_LOG echo -e "\n***\n*** Test FAILED\n***" fi diff --git a/qa/L0_simple_go_client/test.sh b/qa/L0_simple_go_client/test.sh new file mode 100755 index 0000000000..ec84bdb8cc --- /dev/null +++ b/qa/L0_simple_go_client/test.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG:="main"} + +GO_CLIENT_DIR=client/src/grpc_generated/go + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS=--model-repository=`pwd`/models +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f *.log + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +# Generate Go stubs. 
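+# Clone the client repo, install protoc-gen-go-grpc, fetch the common repo
+# (proto definitions) at ${TRITON_COMMON_REPO_TAG}, and run gen_go_stubs.sh
+# inside the Go client directory so grpc_simple_client.go builds against
+# freshly generated stubs.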
+rm -fr client common +git clone ${TRITON_REPO_ORGANIZATION}/client.git +go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest + +pushd ${GO_CLIENT_DIR} + +git clone --single-branch --depth=1 -b $TRITON_COMMON_REPO_TAG \ + ${TRITON_REPO_ORGANIZATION}/common.git +bash gen_go_stubs.sh + +set +e + +# Run test for GRPC variant of go client within go.mod path +go run grpc_simple_client.go >>client.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +popd + + +if [ `grep -c "Checking Inference Outputs" ${GO_CLIENT_DIR}/client.log` != "1" ]; then + echo -e "\n***\n*** Failed. Unable to run inference.\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_simple_lib/test.sh b/qa/L0_simple_lib/test.sh new file mode 100755 index 0000000000..7045f512ef --- /dev/null +++ b/qa/L0_simple_lib/test.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +MODELSDIR=`pwd`/models +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository + +export CUDA_VISIBLE_DEVICES=0 + +# Must explicitly set LD_LIBRARY_PATH so that clients can find +# libtritonserver.so. +LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH + +rm -f *.log + +RET=0 + +for SIMPLE_CLIENT in simple ; do + CLIENT_LOG=$SIMPLE_CLIENT + SIMPLE_CLIENT=./$SIMPLE_CLIENT + + for trial in graphdef savedmodel onnx libtorch plan; do + full=${trial}_float32_float32_float32 + rm -rf $MODELSDIR + mkdir -p $MODELSDIR/simple/1 && \ + cp -r $DATADIR/${full}/1/* $MODELSDIR/simple/1/. && \ + cp $DATADIR/${full}/config.pbtxt $MODELSDIR/simple/. 
&& \ + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt && \ + sed -i "s/label_filename:.*//" config.pbtxt) + + set +e + + # No memory type enforcement + $SIMPLE_CLIENT -r $MODELSDIR >>$CLIENT_LOG.$full.log 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG.$full.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + # Enforce I/O to be in specific memory type + for MEM_TYPE in system pinned gpu ; do + $SIMPLE_CLIENT -r $MODELSDIR -m $MEM_TYPE >>$CLIENT_LOG.$full.$MEM_TYPE.log 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG.$full.$MEM_TYPE.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + done + + set -e + done + + # Use savedmodel for addsub ensemble + mkdir -p $MODELSDIR/simple/1 + cp -r $DATADIR/savedmodel_float32_float32_float32/1/* $MODELSDIR/simple/1/. + cp $DATADIR/savedmodel_float32_float32_float32/config.pbtxt $MODELSDIR/simple/. + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt && \ + sed -i "s/label_filename:.*//" config.pbtxt) + + # set up "addsub" ensemble + ENSEMBLEDIR=$DATADIR/../qa_ensemble_model_repository/qa_model_repository/ + rm -rf $MODELSDIR + mkdir -p $MODELSDIR/simple/1 && \ + cp $ENSEMBLEDIR/fan_plan_float32_float32_float32/config.pbtxt $MODELSDIR/simple/. && \ + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt && \ + sed -i "s/label_filename:.*//" config.pbtxt) + + cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \ + mkdir -p $MODELSDIR/nop_TYPE_FP32_-1/1 + + cp -r $DATADIR/plan_float32_float32_float32 $MODELSDIR/. && \ + # make sure version 1 is used (no swap) + rm -r $MODELSDIR/plan_float32_float32_float32/2 && \ + rm -r $MODELSDIR/plan_float32_float32_float32/3 + full=ensemble + + set +e + + # No memory type enforcement + $SIMPLE_CLIENT -r $MODELSDIR >>$CLIENT_LOG.$full.log 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG.$full.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + # Enforce I/O to be in specific memory type + for MEM_TYPE in system pinned gpu ; do + $SIMPLE_CLIENT -r $MODELSDIR -m $MEM_TYPE >>$CLIENT_LOG.$full.$MEM_TYPE.log 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG.$full.$MEM_TYPE.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + done + + # For GPU input / output case, all ensemble allocation should be on GPU + if grep ^I[0-9][0-9][0-9][0-9].*"Internal response".*"memory type 0" $CLIENT_LOG.$full.gpu.log; then + echo -e "\n*** FAILED: unexpected CPU allocation for ensemble" >> $CLIENT_LOG.$full.gpu.log + cat $CLIENT_LOG.$full.gpu.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + + set -e +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_simple_nodejs_client/test.sh b/qa/L0_simple_nodejs_client/test.sh new file mode 100755 index 0000000000..871c793bf9 --- /dev/null +++ b/qa/L0_simple_nodejs_client/test.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 + +TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:="http://github.com/triton-inference-server"} +TRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG:="main"} + +SIMPLE_NODEJS_CLIENT=client.js + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS=--model-repository=`pwd`/models +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f *.log + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +# Get the proto files from the common repo +rm -fr common +git clone --single-branch --depth=1 -b $TRITON_COMMON_REPO_TAG \ + ${TRITON_REPO_ORGANIZATION}/common.git +mkdir proto && cp common/protobuf/*.proto proto/. + +npm install + +set +e + +# Runs test for GRPC variant of nodejs client +node $SIMPLE_NODEJS_CLIENT >> client.log 2>&1 +if [ $? -ne 0 ]; then + RET=1 +fi + +if [ `grep -c "Checking Inference Output" client.log` != "1" ]; then + echo -e "\n***\n*** Failed. Unable to run inference.\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_socket/models/simple/config.pbtxt b/qa/L0_socket/models/simple/config.pbtxt new file mode 100644 index 0000000000..838edd5d55 --- /dev/null +++ b/qa/L0_socket/models/simple/config.pbtxt @@ -0,0 +1,53 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "simple" +platform: "tensorflow_graphdef" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] diff --git a/qa/L0_socket/test.sh b/qa/L0_socket/test.sh new file mode 100755 index 0000000000..2fd37bd054 --- /dev/null +++ b/qa/L0_socket/test.sh @@ -0,0 +1,416 @@ +#!/bin/bash +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
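+
+# Exercises server socket and port configuration: default vs. explicit
+# --http-address/--grpc-address/--metrics-address, endpoints disabled via
+# --allow-http/--allow-grpc/--allow-metrics, expected startup failures when
+# ports overlap on the same address, and port sharing across multiple
+# servers with --reuse-http-port/--reuse-grpc-port.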
+ +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +SERVER_LOG="./inference_server.log" + +DATADIR=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=15 +source ../common/util.sh + +rm -f *.log + +RET=0 + +# CUSTOM CASES +for address in default explicit; do + if [ "$address" == "default" ]; then + # without specifying address, will use "0.0.0.0" as default + SAME_EXPLICIT_ADDRESS="" + DIFF_EXPLICIT_ADDRESS_ARGS="" + else + SAME_EXPLICIT_ADDRESS="--http-address 127.0.0.1 --grpc-address 127.0.0.1 --metrics-address 127.0.0.1" + DIFF_EXPLICIT_ADDRESS="--http-address 127.0.0.1 --grpc-address 127.0.0.2 --metrics-address 127.0.0.3" + fi + + for p in http grpc; do + if [ "$address" == "default" ]; then + # allow illegal http/grpc port if disabled + SERVER_ARGS="--model-repository=$DATADIR --${p}-port -47 --allow-${p} 0" + else + # allow illegal http/grpc address if disabled + SERVER_ARGS="--model-repository=$DATADIR --${p}-address -47 --allow-${p} 0" + fi + run_server_nowait + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + kill $SERVER_PID + wait $SERVER_PID + + # allow http/grpc port overlap with grpc/http default if disabled + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8001 --allow-http 0" + run_server_nowait + sleep 15 + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --grpc-port 8000 --allow-grpc 0" + run_server + fi + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + kill $SERVER_PID + wait $SERVER_PID + + # error if http/grpc port overlaps with grpc/http default port + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8001" + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --grpc-port 8000" + fi + run_server + if [ "$SERVER_PID" != "0" ]; then + set +e + kill $SERVER_PID + wait $SERVER_PID + if [ "$?" 
== "0" ]; then + echo -e "\n***\n*** unexpected start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + set -e + fi + + # when using different addresses, allow http/grpc port overlap with grpc/http default port + if [ "$address" == "explicit" ]; then + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $DIFF_EXPLICIT_ADDRESS --http-port 8001" + else + SERVER_ARGS="--model-repository=$DATADIR $DIFF_EXPLICIT_ADDRESS --grpc-port 8000" + fi + run_server_nowait + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + kill $SERVER_PID + wait $SERVER_PID + fi + + # allow http/grpc port overlap with grpc/http explicit if disabled + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8007 --grpc-port 8007 --allow-http 0" + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --grpc-port 8007 --http-port 8007 --allow-grpc 0" + fi + run_server_nowait + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + kill $SERVER_PID + wait $SERVER_PID + + # error if http/grpc port overlaps with grpc/http explicit port + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8003 --grpc-port 8003" + run_server_nowait + sleep 15 + if [ "$SERVER_PID" != "0" ]; then + set +e + kill $SERVER_PID + wait $SERVER_PID + if [ "$?" == "0" ]; then + echo -e "\n***\n*** unexpected start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + set -e + fi + else + # skip, same as http case + true + fi + + # when using different addresses, allow http/grpc port overlap with grpc/http explicit + if [ "$address" == "explicit" ]; then + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $DIFF_EXPLICIT_ADDRESS --http-port 8007 --grpc-port 8007" + else + SERVER_ARGS="--model-repository=$DATADIR $DIFF_EXPLICIT_ADDRESS --grpc-port 8007 --http-port 8007" + fi + run_server_nowait + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + code=`curl -s -w %{http_code} 127.0.0.1:8007/v2/health/ready` + if [ "$code" != "200" ]; then + echo -e "\n***\n*** Server is not ready\n***" + RET=1 + fi + kill $SERVER_PID + wait $SERVER_PID + fi + + # allow http/grpc port overlap with metrics default port if disabled + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8002 --allow-http 0" + run_server_nowait + sleep 15 + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --grpc-port 8002 --allow-grpc 0" + run_server + fi + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + kill $SERVER_PID + wait $SERVER_PID + + # error if http/grpc port overlaps with metrics default port + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8002" + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --grpc-port 8002" + fi + run_server + if [ "$SERVER_PID" != "0" ]; then + set +e + kill $SERVER_PID + wait $SERVER_PID + if [ "$?" 
== "0" ]; then + echo -e "\n***\n*** unexpected start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + set -e + fi + + # when using different addresses, allow grpc port overlap with metrics default port + if [ "$address" == "explicit" ]; then + if [ "$p" == "grpc" ]; then + SERVER_ARGS="--model-repository=$DATADIR $DIFF_EXPLICIT_ADDRESS --grpc-port 8002" + run_server_nowait + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + code=`curl -s -w %{http_code} 127.0.0.1:8000/v2/health/ready` + if [ "$code" != "200" ]; then + echo -e "\n***\n*** Server is not ready\n***" + RET=1 + fi + kill $SERVER_PID + wait $SERVER_PID + else + # http and metrics server bind to the same address, should skip this test case. + true + fi + fi + + # allow metrics port overlap with http/grpc default port if disabled + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --metrics-port 8000 --allow-metrics 0" + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --metrics-port 8001 --allow-metrics 0" + fi + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + kill $SERVER_PID + wait $SERVER_PID + + # error if metrics port overlaps with http/grpc default port + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --metrics-port 8000" + else + SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --metrics-port 8001" + fi + run_server + if [ "$SERVER_PID" != "0" ]; then + set +e + kill $SERVER_PID + wait $SERVER_PID + if [ "$?" == "0" ]; then + echo -e "\n***\n*** unexpected start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + set -e + fi + + # when using different addresses, allow metrics port overlap with grpc default port + if [ "$address" == "explicit" ]; then + if [ "$p" == "grpc" ]; then + SERVER_ARGS="--model-repository=$DATADIR $DIFF_EXPLICIT_ADDRESS --metrics-port 8001" + run_server_nowait + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + code=`curl -s -w %{http_code} 127.0.0.1:8000/v2/health/ready` + if [ "$code" != "200" ]; then + echo -e "\n***\n*** Server is not ready\n***" + RET=1 + fi + kill $SERVER_PID + wait $SERVER_PID + else + # http and metrics server bind to the same address, should skip this test case. 
+ true + fi + fi + done +done + +# Test multiple servers binding to the same http/grpc port +SERVER0_LOG="./inference_server0.log" +SERVER1_LOG="./inference_server1.log" +SERVER2_LOG="./inference_server2.log" + +for p in http grpc; do + # error if servers bind to the same http/grpc port without setting the reuse flag + if [ "$p" == "http" ]; then + SERVER_ARGS="--model-repository=$DATADIR --metrics-port 8002 --reuse-grpc-port=true" + SERVER0_ARGS="--model-repository=$DATADIR --metrics-port 8003 --reuse-grpc-port=true" + SERVER1_ARGS="--model-repository=$DATADIR --metrics-port 8004 --reuse-grpc-port=true" + else + SERVER_ARGS="--model-repository=$DATADIR --metrics-port 8002 --reuse-http-port=true" + SERVER0_ARGS="--model-repository=$DATADIR --metrics-port 8003 --reuse-http-port=true" + SERVER1_ARGS="--model-repository=$DATADIR --metrics-port 8004 --reuse-http-port=true" + fi + # make sure the first server is launched successfully, then run the other + # two servers and expect them to fail + run_server + run_multiple_servers_nowait 2 + sleep 15 + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start SERVER $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + if [ "$SERVER0_PID" != "0" ]; then + set +e + kill $SERVER0_PID + wait $SERVER0_PID + if [ "$?" == "0" ]; then + echo -e "\n***\n*** unexpected start SERVER0 $SERVER\n***" + cat $SERVER0_LOG + exit 1 + fi + set -e + fi + if [ "$SERVER1_PID" != "0" ]; then + set +e + kill $SERVER1_PID + wait $SERVER1_PID + if [ "$?" == "0" ]; then + echo -e "\n***\n*** unexpected start SERVER1 $SERVER\n***" + cat $SERVER1_LOG + exit 1 + fi + set -e + fi + kill_server + + # 1. Allow multiple servers to bind to the same http/grpc port by setting the reuse flag + # 2. Test different forms of setting --metrics-address and verify metrics are queryable + # (a) Test default metrics-address being same as http-address + # (b) Test setting metrics-address explicitly to 0.0.0.0 + # (c) Test setting metrics-address explicitly to 127.0.0.2 + SERVER0_ARGS="--model-repository=$DATADIR --metrics-port 8002 --reuse-http-port=true --reuse-grpc-port=true" + SERVER1_ARGS="--model-repository=$DATADIR --metrics-address 0.0.0.0 --metrics-port 8003 --reuse-http-port=true --reuse-grpc-port=true" + SERVER2_ARGS="--model-repository=$DATADIR --metrics-address 127.0.0.2 --metrics-port 8004 --reuse-http-port=true --reuse-grpc-port=true" + run_multiple_servers_nowait 3 + sleep 15 + if [ "$SERVER0_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start SERVER0 $SERVER\n***" + cat $SERVER0_LOG + exit 1 + fi + if [ "$SERVER1_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start SERVER1 $SERVER\n***" + cat $SERVER1_LOG + exit 1 + fi + if [ "$SERVER2_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start SERVER2 $SERVER\n***" + cat $SERVER2_LOG + exit 1 + fi + + set +e + + # test if requests are being distributed among three servers + if [ "$p" == "http" ]; then + CLIENT_PY=../clients/simple_http_infer_client.py + else + CLIENT_PY=../clients/simple_grpc_infer_client.py + fi + + pids=() + for i in {0..10}; do + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 & + pids+=" $!"
+ done + wait $pids || { echo -e "\n***\n*** Python ${p} Async Infer Test Failed\n***"; cat $CLIENT_LOG; RET=1; } + + set -e + + server0_request_count=`curl -s localhost:8002/metrics | awk '/nv_inference_request_success{/ {print $2}'` + server1_request_count=`curl -s localhost:8003/metrics | awk '/nv_inference_request_success{/ {print $2}'` + server2_request_count=`curl -s 127.0.0.2:8004/metrics | awk '/nv_inference_request_success{/ {print $2}'` + if [ ${server0_request_count%.*} -eq 0 ] || \ + [ ${server1_request_count%.*} -eq 0 ] || \ + [ ${server2_request_count%.*} -eq 0 ]; then + echo -e "\n***\n*** Failed: ${p} requests are not distributed among all servers.\n***" + RET=1 + fi + kill_servers +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi +exit $RET diff --git a/qa/L0_storage_S3/test.sh b/qa/L0_storage_S3/test.sh new file mode 100755 index 0000000000..f16dc81e83 --- /dev/null +++ b/qa/L0_storage_S3/test.sh @@ -0,0 +1,529 @@ +#!/bin/bash +# Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG_BASE="./client" +INFER_TEST="../common/infer_test.py" +EXPECTED_NUM_TESTS="3" +TEST_RESULT_FILE='test_results.txt' + +# S3 credentials are necessary for this test. 
Pass via ENV variables +aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + +# S3 bucket path (Point to bucket when testing cloud storage) +BUCKET_URL="s3://triton-bucket-${CI_JOB_ID}" + +# Cleanup and delete S3 test bucket if it already exists (due to test failure) +aws s3 rm $BUCKET_URL --recursive --include "*" && \ + aws s3 rb $BUCKET_URL || true + +# Make S3 test bucket +aws s3 mb "${BUCKET_URL}" + +# Remove Slash in BUCKET_URL +BUCKET_URL=${BUCKET_URL%/} +BUCKET_URL_SLASH="${BUCKET_URL}/" + +# Backup S3 credentials as they will be unset during the test +AWS_DEFAULT_REGION_BACKUP=$AWS_DEFAULT_REGION +AWS_ACCESS_KEY_ID_BACKUP=$AWS_ACCESS_KEY_ID +AWS_SECRET_ACCESS_KEY_BACKUP=$AWS_SECRET_ACCESS_KEY + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=600 + +SERVER_LOG_BASE="./inference_server" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* +RET=0 + +# Test 3 Scenarios: +# 1. Only AWS ENV vars (Without aws configure) +# 2. AWS ENV vars + dummy values in aws configure [ENV vars have higher priority] +# 3. Only AWS configured (Without AWS ENV vars) +for ENV_VAR in "env" "env_dummy" "config"; do + SERVER_LOG=$SERVER_LOG_BASE.$ENV_VAR.log + CLIENT_LOG=$CLIENT_LOG_BASE.$ENV_VAR.log + + if [ "$ENV_VAR" == "config" ]; then + unset AWS_ACCESS_KEY_ID + unset AWS_SECRET_ACCESS_KEY + unset AWS_DEFAULT_REGION + elif [ "$ENV_VAR" == "env_dummy" ]; then + aws configure set default.region "dummy_region" && \ + aws configure set aws_access_key_id "dummy_id" && \ + aws configure set aws_secret_access_key "dummy_key" + else + rm ~/.aws/credentials && rm ~/.aws/config + fi + + # Construct model repository + + KIND="KIND_GPU" + + # Test coverage for extra slashes + for MAYBE_SLASH in "" "/" "//"; do + + ROOT_REPO="$BUCKET_URL$MAYBE_SLASH" + MODEL_REPO="${BUCKET_URL}/${MAYBE_SLASH}models${MAYBE_SLASH}" + + # copy models in model directory + rm -rf models && mkdir -p models + + # perform empty repo tests + + SERVER_ARGS="--model-repository=$ROOT_REPO --exit-timeout-secs=120" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + # run with a non-root empty model repo + touch models/dummy + if [ "$ENV_VAR" != "config" ]; then + aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + fi + aws s3 cp . 
"$BUCKET_URL_SLASH" --recursive --include "*" + if [ "$ENV_VAR" == "env_dummy" ]; then + aws configure set default.region "dummy_region" && \ + aws configure set aws_access_key_id "dummy_id" && \ + aws configure set aws_secret_access_key "dummy_key" + elif [ "$ENV_VAR" == "env" ]; then + rm ~/.aws/credentials && rm ~/.aws/config + fi + + SERVER_ARGS="--model-repository=$MODEL_REPO --exit-timeout-secs=120" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + if [ "$ENV_VAR" != "config" ]; then + aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + fi + aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + rm models/dummy + + # Now start model tests + + for FW in graphdef savedmodel onnx libtorch plan; do + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${FW}_float32_float32_float32/ models/ + done + + # Copy models with string inputs and remove nobatch (bs=1) models + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/*_object_object_object/ models/ + rm -rf models/*nobatch* + + for FW in graphdef savedmodel onnx libtorch plan; do + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: ${KIND} }]" >> $MC + done + done + + # now traverse the tree and create empty version directories that the CLI skips + for dir in `ls models/`; do + for subdir in `ls models/$dir`; do + if [ -d models/$dir/$subdir ] && [ -z "$(ls models/$dir/$subdir)" ]; then + touch models/$dir/$subdir/$subdir + fi + done + done + + # Perform test with model repository variants + for src in "models/" "." ; do + + # copy contents of /models into S3 bucket. + aws s3 cp $src $BUCKET_URL_SLASH --recursive --include "*" + if [ "$ENV_VAR" == "env_dummy" ]; then + aws configure set default.region "dummy_region" && \ + aws configure set aws_access_key_id "dummy_id" && \ + aws configure set aws_secret_access_key "dummy_key" + elif [ "$ENV_VAR" == "env" ]; then + rm ~/.aws/credentials && rm ~/.aws/config + fi + + if [ "$src" == "." ]; then + # set server arguments + SERVER_ARGS="--model-repository=$MODEL_REPO --exit-timeout-secs=120" + else + # set server arguments + SERVER_ARGS="--model-repository=$ROOT_REPO --exit-timeout-secs=120" + fi + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $INFER_TEST >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + + # Clean up bucket + if [ "$ENV_VAR" != "config" ]; then + aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + fi + aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + done + done +done + +# Restore S3 credentials +rm ~/.aws/credentials && rm ~/.aws/config +export AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION_BACKUP +export AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID_BACKUP +export AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY_BACKUP +aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + +# Test with polling enabled +SERVER_ARGS="--model-repository=$ROOT_REPO --exit-timeout-secs=120 --model-control-mode=poll" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# copy contents of /models into S3 bucket and wait for them to be loaded. +aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*" +sleep 600 + +set +e + +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test localization to a specified location +export TRITON_AWS_MOUNT_DIRECTORY=`pwd`/aws_localization_test + +if [ -d "$TRITON_AWS_MOUNT_DIRECTORY" ]; then + rm -rf $TRITON_AWS_MOUNT_DIRECTORY +fi + +mkdir -p $TRITON_AWS_MOUNT_DIRECTORY + +SERVER_LOG=$SERVER_LOG_BASE.custom_localization.log +SERVER_ARGS="--model-repository=$ROOT_REPO --exit-timeout-secs=120" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +if [ -z "$(ls -A $TRITON_AWS_MOUNT_DIRECTORY)" ]; then + echo -e "\n***\n*** Test localization to a specified location failed. \n***" + echo -e "\n***\n*** Specified mount folder $TRITON_AWS_MOUNT_DIRECTORY is empty \n***" + ls -A $TRITON_AWS_MOUNT_DIRECTORY + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +if [ -d "$TRITON_AWS_MOUNT_DIRECTORY" ] && [ ! -z "$(ls -A $TRITON_AWS_MOUNT_DIRECTORY)" ]; then + echo -e "\n***\n*** Test localization to a specified location failed. \n***" + echo -e "\n***\n*** Specified mount folder $TRITON_AWS_MOUNT_DIRECTORY was not cleared properly. \n***" + ls -A $TRITON_AWS_MOUNT_DIRECTORY + exit 1 +fi + +rm -rf $TRITON_AWS_MOUNT_DIRECTORY +unset TRITON_AWS_MOUNT_DIRECTORY + +# Save models for AWS_SESSION_TOKEN test +rm -rf tmp_cred_test_models +mv models tmp_cred_test_models +# Clean up bucket contents +aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + +# Test reload of model with explicit model control +rm -rf models && mkdir -p models/libtorch_float32_float32_float32 && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/libtorch_float32_float32_float32/1 models/libtorch_float32_float32_float32/. && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/libtorch_float32_float32_float32/config.pbtxt models/libtorch_float32_float32_float32/. 
+ cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/libtorch_float32_float32_float32/output0_labels.txt models/libtorch_float32_float32_float32/. + +# Remove version policy from config.pbtxt +sed -i '/^version_policy/d' models/libtorch_float32_float32_float32/config.pbtxt + +# Copy contents of models into S3 bucket +aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*" + +SERVER_ARGS="--model-repository=$BUCKET_URL --exit-timeout-secs=120 --model-control-mode=explicit" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +curl -X POST localhost:8000/v2/repository/models/libtorch_float32_float32_float32/load + +CURL_LOG=$(curl -X POST localhost:8000/v2/repository/index) + +if [ "$CURL_LOG" != "[{\"name\":\"libtorch_float32_float32_float32\",\"version\":\"1\",\"state\":\"READY\"}]" ]; then + RET=1 +fi + +# Add new model version +aws s3 cp /data/inferenceserver/${REPO_VERSION}/qa_model_repository/libtorch_float32_float32_float32/3 "${BUCKET_URL_SLASH}libtorch_float32_float32_float32/3" --recursive --include "*" + +curl -X POST localhost:8000/v2/repository/models/libtorch_float32_float32_float32/load + +CURL_LOG=$(curl -X POST localhost:8000/v2/repository/index) +if [ "$CURL_LOG" != "[{\"name\":\"libtorch_float32_float32_float32\",\"version\":\"1\",\"state\":\"UNAVAILABLE\",\"reason\":\"unloaded\"},{\"name\":\"libtorch_float32_float32_float32\",\"version\":\"3\",\"state\":\"READY\"}]" ]; then + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Clean up bucket contents +aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + +# Test with temporary credential (AWS_SESSION_TOKEN) +AWS_GET_SESSION_TOKEN_RES=`aws sts get-session-token --duration-seconds 900` && \ + export AWS_ACCESS_KEY_ID=`echo $AWS_GET_SESSION_TOKEN_RES | jq -r ".Credentials.AccessKeyId"` && \ + export AWS_SECRET_ACCESS_KEY=`echo $AWS_GET_SESSION_TOKEN_RES | jq -r ".Credentials.SecretAccessKey"` && \ + export AWS_SESSION_TOKEN=`echo $AWS_GET_SESSION_TOKEN_RES | jq -r ".Credentials.SessionToken"` +rm ~/.aws/credentials && rm ~/.aws/config +aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY && \ + aws configure set aws_session_token $AWS_SESSION_TOKEN + +# Copy models into S3 bucket +aws s3 cp tmp_cred_test_models/ "${BUCKET_URL_SLASH}" --recursive --include "*" + +SERVER_LOG=$SERVER_LOG_BASE.temporary_credentials_test.log +SERVER_ARGS="--model-repository=$BUCKET_URL --exit-timeout-secs=120" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test access decline +export AWS_SECRET_ACCESS_KEY="[Invalid]" && export AWS_SESSION_TOKEN="" +SERVER_LOG=$SERVER_LOG_BASE.access_decline_test.log +SERVER_ARGS="--model-repository=$BUCKET_URL --exit-timeout-secs=120" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected server start $SERVER\n***" + cat $SERVER_LOG + kill $SERVER_PID + wait $SERVER_PID + RET=1 +else + # AWS S3 does not appear to return a response body when access is declined, but other + # implementations might provide extra messages, so make sure Triton prints them. + EXPECTED_MSG="Unable to create S3 filesystem client. Check account credentials. Exception: '' Message: 'No response body.'" + if ! grep "$EXPECTED_MSG" $SERVER_LOG; then + echo -e "\n***\n*** Expected error message not found\n***" + cat $SERVER_LOG + RET=1 + fi +fi + +# Restore S3 credentials +rm ~/.aws/credentials && rm ~/.aws/config +export AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION_BACKUP +export AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID_BACKUP +export AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY_BACKUP +aws configure set default.region $AWS_DEFAULT_REGION && \ + aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID && \ + aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + +# Clean up bucket contents +aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" + +# Test case where S3 folder has >1000 files +rm -rf models + +mkdir -p models/model/1 +# Create Python model that reads the number of files in the +# model directory when loaded +echo "import os + +class TritonPythonModel: + + def initialize(self, args): + count = 0 + model_dir = args['model_repository'] + for path in os.listdir(model_dir): + if os.path.isfile(os.path.join(model_dir, path)): + count += 1 + print('Found {} files in model directory'.format(count)) + + def execute(self, requests): + pass" > models/model/1/model.py + +for i in {1..1050}; do + touch models/model/0${i}.txt +done + +# Provide extended timeout to allow >1000 files to be loaded +SERVER_ARGS="--model-repository=$BUCKET_URL --exit-timeout-secs=600 --model-control-mode=none" +SERVER_LOG=$SERVER_LOG_BASE.many_files.log + +# copy contents of /models into S3 bucket and wait for them to be loaded. +aws s3 cp models/ "${BUCKET_URL_SLASH}" --recursive --include "*" + +# Test that the server starts up. Files will be loaded in numerically +# ascending order, so the model file is loaded after the first 1000 +# files. If AWS fails to load >1000 files, the model file will not +# be loaded and the server will fail to start. + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +# Confirm the correct number of files loaded +EXPECTED_MSG="Found 1050 files in model directory" +if !
grep "$EXPECTED_MSG" $SERVER_LOG; then +echo -e "\n***\n*** Expected file count message not found\n***" +cat $SERVER_LOG +RET=1 +fi + +# Clean up bucket contents and delete bucket +aws s3 rm "${BUCKET_URL_SLASH}" --recursive --include "*" +aws s3 rb "${BUCKET_URL}" + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_storage_S3_local/mock_s3_service.py b/qa/L0_storage_S3_local/mock_s3_service.py new file mode 100755 index 0000000000..956aac0e66 --- /dev/null +++ b/qa/L0_storage_S3_local/mock_s3_service.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import threading +import time +from http.server import BaseHTTPRequestHandler, HTTPServer + + +class MockS3Service: + __address = "localhost" + __port = 8080 + + def __init__(self): + # Test passed when: + # - at least one HEAD request is received; and + # - at least one GET request is received; and + # - all received requests do not advertise for HTTP/2. 
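+ # Illustrative note (not asserted verbatim anywhere below): a client advertising
+ # cleartext HTTP/2 (h2c upgrade) would typically send headers such as
+ #   Connection: Upgrade, HTTP2-Settings
+ #   Upgrade: h2c
+ #   HTTP2-Settings: <base64-encoded SETTINGS payload>
+ # and the validator below treats any of these patterns as an HTTP/2 advertisement.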
+ test_results = {"head_count": 0, "get_count": 0, "http2_ads": False} + + class RequestValidator(BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + + def __CheckHttp2Ads(self): + if "connection" in self.headers: + v = self.headers["connection"].lower() + if "upgrade" in v or "http2" in v: + test_results["http2_ads"] = True + if ( + "upgrade" in self.headers + and "h2c" in self.headers["upgrade"].lower() + ): + test_results["http2_ads"] = True + if "http2-settings" in self.headers: + test_results["http2_ads"] = True + + def do_HEAD(self): + self.__CheckHttp2Ads() + test_results["head_count"] += 1 + self.send_response(200) + self.end_headers() + + def do_GET(self): + self.__CheckHttp2Ads() + test_results["get_count"] += 1 + self.send_error( + 404, + "Thank you for using the mock s3 service!", + "Your bucket is not found here!", + ) + + self.__test_results = test_results + self.__server = HTTPServer((self.__address, self.__port), RequestValidator) + self.__service_thread = threading.Thread(target=self.__server.serve_forever) + + def __enter__(self): + self.__service_thread.start() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.__server.shutdown() + self.__server.server_close() + self.__service_thread.join() + + def TestPassed(self): + return ( + self.__test_results["head_count"] > 0 + and self.__test_results["get_count"] > 0 + and not self.__test_results["http2_ads"] + ) + + +if __name__ == "__main__": + # Initialize mock service + mock_s3_service = MockS3Service() + + # Start service and poll until test passed or timed-out + with mock_s3_service: + poll_interval = 1 # seconds + timeout = 10 # seconds + elapsed_time = 0 # seconds + while not mock_s3_service.TestPassed() and elapsed_time < timeout: + elapsed_time += poll_interval + time.sleep(poll_interval) + + # Print the result + if mock_s3_service.TestPassed(): + print("TEST PASSED") + else: + print("TEST FAILED") diff --git a/qa/L0_storage_S3_local/test.sh b/qa/L0_storage_S3_local/test.sh new file mode 100755 index 0000000000..e60b106b31 --- /dev/null +++ b/qa/L0_storage_S3_local/test.sh @@ -0,0 +1,387 @@ +#!/bin/bash +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +TEST_RESULT_FILE='test_results.txt' +INFER_TEST="../common/infer_test.py" +EXPECTED_NUM_TESTS="3" + +DATADIR="/data/inferenceserver/${REPO_VERSION}/qa_model_repository" +# Used to control which backends are run in infer_test.py +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch plan"} + +function run_unit_tests() { + echo "Running unit tests: ${INFER_TEST}" + python $INFER_TEST >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi +} + +function setup_model_repo() { + model_repo=${1:-"models"} + backends=${2:-${BACKENDS}} + types=${3:-"float32_float32_float32 object_object_object"} + echo "[setup_model_repo] model_repo: ${model_repo}, backends: ${backends}" + rm -rf ${model_repo} && mkdir ${model_repo} + for BACKEND in ${backends}; do + for TYPE in ${types}; do + model="${BACKEND}_${TYPE}" + echo "Copying ${DATADIR}/${model} to ${model_repo}." + cp -r "${DATADIR}/${model}" "${model_repo}/" + # Remove version policy from config.pbtxt + sed -i '/^version_policy/d' ${model_repo}/${model}/config.pbtxt + done + done +} + +function load_models() { + model_repo=${1:-"models"} + for model in `ls ${model_repo}`; do + echo "Loading model: ${model}" + code=`curl -s -w %{http_code} -X POST localhost:8000/v2/repository/models/${model}/load` + if [ "$code" != "200" ]; then + echo -e "\n***\n*** Test Failed. 
Failed to load model: ${model}\n***" + RET=1 + fi + done +} + +set +e +setup_model_repo +set -e + +# Create model with name that has all types of allowed characters +DUMMY_MODEL="Model_repo-1.0" +cp -r models/libtorch_float32_float32_float32 models/$DUMMY_MODEL +sed -i 's/libtorch_float32_float32_float32/Model_repo-1.0/g' models/$DUMMY_MODEL/config.pbtxt + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +rm -f *.log* + +## Setup local MINIO server +(wget https://dl.min.io/server/minio/release/linux-amd64/minio && \ + chmod +x minio && \ + mv minio /usr/local/bin && \ + mkdir /usr/local/share/minio && \ + mkdir /etc/minio) + +export MINIO_ACCESS_KEY="minio" +# Specify MINIO CI env to allow using root disk +# https://github.com/minio/minio/issues/15030 +export MINIO_CI_CD=true +MINIO_VOLUMES="/usr/local/share/minio/" +MINIO_OPTS="-C /etc/minio --address 127.0.0.1:4572" +export MINIO_SECRET_KEY="miniostorage" + +(curl -O https://raw.githubusercontent.com/minio/minio-service/master/linux-systemd/minio.service && \ + mv minio.service /etc/systemd/system) + +# Start minio server +/usr/local/bin/minio server $MINIO_OPTS $MINIO_VOLUMES & +MINIO_PID=$! + +export AWS_ACCESS_KEY_ID=minio && \ + export AWS_SECRET_ACCESS_KEY=miniostorage + +# Force version to 0.07 to prevent failures due to version changes +python -m pip install awscli-local==0.07 + +# Needed to set correct port for awscli-local +ENDPOINT_FLAG="--endpoint-url=http://localhost:4572" + +# Cleanup bucket if exists +awslocal $ENDPOINT_FLAG s3 rm s3://demo-bucket1.0 --recursive --include "*" && \ + awslocal $ENDPOINT_FLAG s3 rb s3://demo-bucket1.0 || true + +# Create and add data to bucket +awslocal $ENDPOINT_FLAG s3 mb s3://demo-bucket1.0 && \ + awslocal $ENDPOINT_FLAG s3 sync models s3://demo-bucket1.0 + +RET=0 + +# Test with hostname and IP address +echo "=== Running hostname/IP tests ===" +for HOST in "127.0.0.1" "localhost"; do + SERVER_ARGS="--model-repository=s3://$HOST:4572/demo-bucket1.0 --model-control-mode=explicit" + if [ "$HOST" = "127.0.0.1" ]; then + SERVER_LOG="./inference_server_hostname.log" + else + SERVER_LOG="./inference_server_ip.log" + fi + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + # Kill minio server + kill $MINIO_PID + wait $MINIO_PID + exit 1 + fi + + set +e + load_models + run_unit_tests + + # Try to load model with name that checks for all types of allowed characters + code=`curl -s -w %{http_code} -X POST localhost:8000/v2/repository/models/${DUMMY_MODEL}/load` + if [ "$code" != "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test with Polling +echo "=== Running polling tests ===" +SERVER_ARGS="--model-repository=s3://localhost:4572/demo-bucket1.0 --model-control-mode=poll" +SERVER_LOG="./inference_server_poll.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + # Kill minio server + kill $MINIO_PID + wait $MINIO_PID + exit 1 +fi + +cp -r models/libtorch_float32_float32_float32/1 models/libtorch_float32_float32_float32/4 +awslocal $ENDPOINT_FLAG s3 sync models s3://demo-bucket1.0 + +sleep 20 + +set +e +CURL_LOG=$(curl -X POST localhost:8000/v2/repository/index) +if [[ "$CURL_LOG" != *"{\"name\":\"libtorch_float32_float32_float32\",\"version\":\"3\",\"state\":\"UNAVAILABLE\",\"reason\":\"unloaded\"}"* ]]; then + echo -e "\n***\n*** Failed. 
Server did not unload libtorch_float32_float32_float32 version 3\n***" + RET=1 +fi + +if [[ "$CURL_LOG" != *"{\"name\":\"libtorch_float32_float32_float32\",\"version\":\"4\",\"state\":\"READY\"}"* ]]; then + echo -e "\n***\n*** Failed. Server did not load libtorch_float32_float32_float32 version 4\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Destroy bucket +awslocal $ENDPOINT_FLAG s3 rm s3://demo-bucket1.0 --recursive --include "*" && \ + awslocal $ENDPOINT_FLAG s3 rb s3://demo-bucket1.0 + +# Test with Polling, no model configuration file - with strict model config disabled +echo "=== Running autocomplete tests ===" +AUTOCOMPLETE_BACKENDS="savedmodel" +export BACKENDS=${AUTOCOMPLETE_BACKENDS} + +set +e +setup_model_repo + +TYPES="float32_float32_float32 object_object_object" +for BACKEND in ${AUTOCOMPLETE_BACKENDS}; do + for TYPE in ${TYPES}; do + model="${BACKEND}_${TYPE}" + # Config files specify things expected by unit test like label_filename + # and max_batch_size for comparing results, so remove some key fields + # for autocomplete to fill that won't break the unit test. + sed -i '/platform:/d' models/${model}/config.pbtxt + sed -i '/data_type:/d' models/${model}/config.pbtxt + sed -i '/dims:/d' models/${model}/config.pbtxt + done +done +set -e + +awslocal $ENDPOINT_FLAG s3 mb s3://demo-bucket1.0 && \ + awslocal $ENDPOINT_FLAG s3 sync models s3://demo-bucket1.0 + +SERVER_ARGS="--model-repository=s3://localhost:4572/demo-bucket1.0 --model-control-mode=poll --strict-model-config=false" +SERVER_LOG="./inference_server_noconfig.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + # Kill minio server + kill $MINIO_PID + wait $MINIO_PID + exit 1 +fi + +run_unit_tests + +kill $SERVER_PID +wait $SERVER_PID + +# Destroy bucket +awslocal $ENDPOINT_FLAG s3 rm s3://demo-bucket1.0 --recursive --include "*" && \ + awslocal $ENDPOINT_FLAG s3 rb s3://demo-bucket1.0 + +# Test for multiple model repositories using S3 cloud storage +echo "=== Running multiple-model-repository tests ===" +BACKENDS1="graphdef libtorch" +BACKENDS2="onnx plan savedmodel" +export BACKENDS="$BACKENDS1 $BACKENDS2" + +set +e +setup_model_repo "models1" "${BACKENDS1}" +setup_model_repo "models2" "${BACKENDS2}" +set -e + +BUCKET_NAME="demo-bucket" +MODEL_REPO_ARGS="" +for BUCKET_SUFFIX in 1 2; do + # Cleanup bucket if exists + awslocal $ENDPOINT_FLAG s3 rm s3://$BUCKET_NAME$BUCKET_SUFFIX --recursive --include "*" && \ + awslocal $ENDPOINT_FLAG s3 rb s3://$BUCKET_NAME$BUCKET_SUFFIX || true + + # Create and add data to bucket + awslocal $ENDPOINT_FLAG s3 mb s3://$BUCKET_NAME$BUCKET_SUFFIX && \ + awslocal $ENDPOINT_FLAG s3 sync models$BUCKET_SUFFIX s3://$BUCKET_NAME$BUCKET_SUFFIX + + MODEL_REPO_ARGS="$MODEL_REPO_ARGS --model-repository=s3://localhost:4572/$BUCKET_NAME$BUCKET_SUFFIX" +done + +SERVER_ARGS="$MODEL_REPO_ARGS --model-control-mode=explicit" +SERVER_LOG="./inference_server.multi.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + # Kill minio server + kill $MINIO_PID + wait $MINIO_PID + exit 1 +fi + +set +e +load_models "models1" +load_models "models2" +run_unit_tests +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test access decline +AWS_SECRET_ACCESS_KEY_BACKUP=$AWS_SECRET_ACCESS_KEY +export AWS_SECRET_ACCESS_KEY="[Invalid]" +SERVER_ARGS="--model-repository=s3://localhost:4572/${BUCKET_NAME}1 --exit-timeout-secs=120" 
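+# (With the invalid secret key the server is expected to fail during startup while
+# scanning the repository; the grep below then checks the log for the S3 client error.)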
+SERVER_LOG="./inference_server.access_decline.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected server start $SERVER\n***" + cat $SERVER_LOG + kill $SERVER_PID + wait $SERVER_PID + RET=1 +else + # MinIO does not appear to reply on access decline, but other implementations + # might provide extra messages, so make sure Triton will print the messages. + EXPECTED_MSG="Unable to create S3 filesystem client. Check account credentials. Exception: '' Message: 'No response body.'" + if ! grep "$EXPECTED_MSG" $SERVER_LOG; then + echo -e "\n***\n*** Expected error message not found\n***" + cat $SERVER_LOG + RET=1 + fi +fi +# Restore keys for destroying buckets +export AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY_BACKUP + +# Destroy buckets +for BUCKET_SUFFIX in 1 2; do + awslocal $ENDPOINT_FLAG s3 rm s3://$BUCKET_NAME$BUCKET_SUFFIX --recursive --include "*" && \ + awslocal $ENDPOINT_FLAG s3 rb s3://$BUCKET_NAME$BUCKET_SUFFIX || true +done + +# Kill minio server +kill $MINIO_PID +wait $MINIO_PID + +# Test the S3 client will not advertise HTTP/2 +TEST_LOG="./http2_advertise_test.log" +python3 mock_s3_service.py > $TEST_LOG 2>&1 & +sleep 2 # make sure the mock service has started +SERVER_LOG="./http2_advertise_test.server.log" +SERVER_ARGS="--model-repository=s3://localhost:8080/dummy-bucket --exit-timeout-secs=120" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Unexpected server start $SERVER\n***" + cat $SERVER_LOG + kill $SERVER_PID + wait $SERVER_PID + RET=1 +else + sleep 2 # make sure the mock service has stopped + PASSED_MSG="TEST PASSED" + if ! grep "$PASSED_MSG" $TEST_LOG; then + echo -e "\n***\n*** S3 client HTTP/2 advertise test failed\n***" + cat $TEST_LOG + RET=1 + fi +fi + +# Print and return test result +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi +exit $RET diff --git a/qa/L0_storage_azure/test.sh b/qa/L0_storage_azure/test.sh new file mode 100755 index 0000000000..15f9c78bcc --- /dev/null +++ b/qa/L0_storage_azure/test.sh @@ -0,0 +1,297 @@ +#!/bin/bash +# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +if [ -z "$AZURE_STORAGE_ACCOUNT" ]; then + echo -e "azure storage account must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi + +if [ -z "$AZURE_STORAGE_KEY" ]; then + echo -e "azure storage key must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi + +ACCOUNT_NAME=$AZURE_STORAGE_ACCOUNT +ACCOUNT_KEY=$AZURE_STORAGE_KEY +export CUDA_VISIBLE_DEVICES=0 +CLIENT_LOG_BASE="./client" +INFER_TEST="../common/infer_test.py" +EXPECTED_NUM_TESTS="3" +timestamp=$(date +%s) +CONTAINER_NAME="tritonqatest${timestamp}" + +# container path (Point to the container when testing cloud storage) +AS_URL="as://${ACCOUNT_NAME}/${CONTAINER_NAME}" + +# Must use setuptools version before 58.0.0 due to https://github.com/Azure/azure-cli/issues/19468 +python -m pip install -U setuptools==57.5.0 + +# Can now install latest azure-cli (instead of 2.0.73) +python -m pip install azure-cli + +# create test container +az storage container create --name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} +sleep 10 + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=420 +SERVER_LOG_BASE="./inference_server" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* +RET=0 + +# Used to control which backends are run in infer_test.py +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch plan"} + +function run_unit_tests() { + BACKENDS=$BACKENDS python $INFER_TEST >$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi +} + +function setup_model_repo() { + # Construct model repository + rm -rf models && mkdir -p models + for FW in $BACKENDS; do + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${FW}_float32_float32_float32 models/ + done + + # Copy models with string inputs and remove nobatch (bs=1) models + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/*_object_object_object models/ + rm -rf models/*nobatch* +} + +setup_model_repo +KIND="KIND_GPU" +for FW in $BACKENDS; do + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: ${KIND} }]" >> $MC + done +done + +# now traverse the tree and create empty version directories that the CLI skips +for dir in `ls models/`; do + for subdir in `ls models/$dir`; do + if [ -d models/$dir/$subdir ] && [ -z "$(ls models/$dir/$subdir)" ]; then + touch models/$dir/$subdir/$subdir + fi + done +done + +# copy contents of models into container. 
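+# (For reference only, not what this test uses: recent azure-cli releases can also
+# upload the whole directory in a single call, e.g.
+#   az storage blob upload-batch -d ${CONTAINER_NAME} -s models --destination-path models \
+#       --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY}
+# The per-file loop below keeps the blob names identical to the local relative paths.)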
+for file in `find models -type f` ;do + az storage blob upload --container-name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} --file $file --name $file +done +sleep 10 + +# Test 1 Scenarios: +# 1. access blob using shared key in envs +# 2. adding more scenarios in future +for ENV_VAR in "shared_key"; do + SERVER_LOG=$SERVER_LOG_BASE.$ENV_VAR.log + CLIENT_LOG=$CLIENT_LOG_BASE.$ENV_VAR.log + MODEL_REPO="${AS_URL}/models" + if [ "$ENV_VAR" == "sas" ]; then + unset AZURE_STORAGE_KEY + sas=`az storage blob generate-sas --container-name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} --name models` + sas_without_quote=$(eval echo $sas) + export AZURE_STORAGE_SAS="?$sas_without_quote" + fi + + # Now start model tests + # set server arguments + SERVER_ARGS="--model-repository=$MODEL_REPO --exit-timeout-secs=120" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + break + fi + + set +e + run_unit_tests + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test localization to a specified location +export TRITON_AZURE_MOUNT_DIRECTORY=`pwd`/azure_localization_test + +if [ -d "$TRITON_AZURE_MOUNT_DIRECTORY" ]; then + rm -rf $TRITON_AZURE_MOUNT_DIRECTORY +fi + +mkdir -p $TRITON_AZURE_MOUNT_DIRECTORY + +SERVER_LOG=$SERVER_LOG_BASE.custom_localization.log +SERVER_ARGS="--model-repository=$MODEL_REPO --exit-timeout-secs=120" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +if [ -z "$(ls -A $TRITON_AZURE_MOUNT_DIRECTORY)" ]; then + echo -e "\n***\n*** Test localization to a specified location failed. \n***" + echo -e "\n***\n*** Specified mount folder $TRITON_AZURE_MOUNT_DIRECTORY is empty \n***" + ls -A $TRITON_AZURE_MOUNT_DIRECTORY + exit 1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +if [ -d "$TRITON_AZURE_MOUNT_DIRECTORY" ] && [ ! -z "$(ls -A $TRITON_AZURE_MOUNT_DIRECTORY)" ]; then + echo -e "\n***\n*** Test localization to a specified location failed. \n***" + echo -e "\n***\n*** Specified mount folder $TRITON_AZURE_MOUNT_DIRECTORY was not cleared properly. 
\n***" + ls -A $TRITON_AZURE_MOUNT_DIRECTORY + exit 1 +fi + +rm -rf $TRITON_AZURE_MOUNT_DIRECTORY +unset TRITON_AZURE_MOUNT_DIRECTORY + +# Add test for explicit model control +SERVER_LOG=$SERVER_LOG_BASE.explicit.log +CLIENT_LOG=$CLIENT_LOG_BASE.explicit.log +SERVER_ARGS="--model-repository=${AS_URL}/models --model-control-mode=explicit" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + RET=1 + break +fi + +set +e +for model in `ls models/`; do + code=`curl -s -w %{http_code} -X POST localhost:8000/v2/repository/models/${model}/load` + if [ "$code" != "200" ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi +done + +# Check that each explicitly loaded model runs correctly +run_unit_tests +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Clean up container +az storage container delete --name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} +sleep 60 + +# Test with Polling, no model configuration file - with strict model config disabled +SERVER_LOG=$SERVER_LOG_BASE.noconfig.log +CLIENT_LOG=$CLIENT_LOG_BASE.noconfig.log +SERVER_ARGS="--model-repository=${AS_URL}/models --model-control-mode=poll --strict-model-config=false" + +# create test container +az storage container create --name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} +sleep 10 + +# Setup model repository with minimal configs to be autocompleted +rm -rf models && mkdir -p models +AUTOCOMPLETE_BACKENDS="savedmodel" +for FW in ${AUTOCOMPLETE_BACKENDS}; do + for model in ${FW}_float32_float32_float32 ${FW}_object_object_object; do + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${model} models/ + # Config files specify things expected by unit test like label_filename + # and max_batch_size for comparing results, so remove some key fields + # for autocomplete to fill that won't break the unit test. + sed -i '/platform:/d' models/${model}/config.pbtxt + sed -i '/data_type:/d' models/${model}/config.pbtxt + sed -i '/dims:/d' models/${model}/config.pbtxt + done +done + +# copy contents of models into container. +for file in `find models -type f` ;do + az storage blob upload --container-name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} --file $file --name $file +done +sleep 10 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +# Check that each polled model runs correctly +export BACKENDS="${AUTOCOMPLETE_BACKENDS}" +run_unit_tests +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Clean up container +az storage container delete --name ${CONTAINER_NAME} --account-name ${ACCOUNT_NAME} --account-key ${ACCOUNT_KEY} + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_storage_swiftstack/infer_test.py b/qa/L0_storage_swiftstack/infer_test.py new file mode 100755 index 0000000000..f8a65a01a4 --- /dev/null +++ b/qa/L0_storage_swiftstack/infer_test.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + + +class InferTest(tu.TestResultCollector): + def _full_exact( + self, input_dtype, output0_dtype, output1_dtype, output0_raw, output1_raw, swap + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): + for bs in (1, batch_size): + iu.infer_exact( + tester, + pf, + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) + + input_size = 16 + + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + for pf in ["graphdef", "savedmodel"]: + _infer_exact_helper( + self, + pf, + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_trt_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size, 1, 1), + (input_size, 1, 1), + (input_size, 1, 1), + ): + if input_dtype == np.int8: + _infer_exact_helper( + self, + "plan", + (input_size, 1, 1), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + else: + _infer_exact_helper( + self, + "plan", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_onnx_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "onnx", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_libtorch_model( + input_dtype, + 
output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "libtorch", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + def test_raw_fff(self): + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) + + def test_class_fff(self): + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + swap=True, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_storage_swiftstack/test.sh b/qa/L0_storage_swiftstack/test.sh new file mode 100755 index 0000000000..99fb5610d6 --- /dev/null +++ b/qa/L0_storage_swiftstack/test.sh @@ -0,0 +1,198 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +TEST_RESULT_FILE='test_results.txt' +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +unset AWS_ACCESS_KEY_ID +unset AWS_SECRET_ACCESS_KEY +unset AWS_DEFAULT_REGION + +pip3 install --no-deps awscli-plugin-endpoint + +# cli_legacy_plugin_path = /usr/local/lib/python3.8/site-packages + +mkdir -p ~/.aws +# Swiftstack S3 credentials are necessary for this test. 
Passed via ENV variables +echo "[plugins] +endpoint = awscli_plugin_endpoint + +[default] +aws_access_key_id = $SWIFTSTACK_ACCESS_KEY_ID +aws_secret_access_key = $SWIFTSTACK_SECRET_ACCESS_KEY +region = $SWIFTSTACK_DEFAULT_REGION + +s3 = + endpoint_url = https://pbss.s8k.io + signature_version = s3v4 + payload_signing_enabled = true +" > ~/.aws/config + +export AWS_ACCESS_KEY_ID=$SWIFTSTACK_ACCESS_KEY_ID && +export AWS_SECRET_ACCESS_KEY=$SWIFTSTACK_SECRET_ACCESS_KEY && +export AWS_DEFAULT_REGION=$SWIFTSTACK_DEFAULT_REGION + +# S3 bucket path (Point to bucket when testing cloud storage) +BUCKET_URL="s3://triton-bucket-${CI_JOB_ID}" + +# S3 repo path to pass to Triton server +S3_REPO_URL="s3://https://pbss.s8k.io:443/triton-bucket-${CI_JOB_ID}" + +# Cleanup S3 test bucket if exists (due to test failure) +aws s3 rm $BUCKET_URL --recursive --include "*" && \ + aws s3 rb $BUCKET_URL || true + +# Make S3 test bucket +aws s3 mb $BUCKET_URL + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_TIMEOUT=420 + +CLIENT_LOG_BASE="./client" +SERVER_LOG_BASE="./inference_server" +INFER_TEST=infer_test.py +EXPECTED_NUM_TESTS="2" +source ../common/util.sh + +rm -f $SERVER_LOG_BASE* $CLIENT_LOG_BASE* +RET=0 + +SERVER_LOG=$SERVER_LOG_BASE.log +CLIENT_LOG=$CLIENT_LOG_BASE.log + +# Copy models in model directory +rm -rf models && mkdir -p models + +aws s3 rm $BUCKET_URL/ --recursive --include "*" + +# Now start model tests + +for FW in graphdef savedmodel onnx libtorch plan; do + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${FW}_float32_float32_float32/ models/ +done + +for FW in graphdef savedmodel onnx libtorch plan; do + for MC in `ls models/${FW}*/config.pbtxt`; do + echo "instance_group [ { kind: KIND_GPU }]" >> $MC + done +done + +# copy contents of /models into S3 bucket. +aws s3 cp models/ $BUCKET_URL/ --recursive --include "*" + +# Test without polling +SERVER_ARGS="--model-repository=$S3_REPO_URL --exit-timeout-secs=120" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Clean up bucket contents +aws s3 rm $BUCKET_URL/ --recursive --include "*" + + +# Test with polling enabled +SERVER_ARGS="--model-repository=$S3_REPO_URL --exit-timeout-secs=120 --model-control-mode=poll" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# copy contents of /models into S3 bucket and wait for them to be loaded. +aws s3 cp models/ $BUCKET_URL/ --recursive --include "*" +sleep 420 + +set +e + +python $INFER_TEST >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Clean up bucket contents and delete bucket +aws s3 rm $BUCKET_URL/ --recursive --include "*" +aws s3 rb $BUCKET_URL + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_string_io/string_client_test.py b/qa/L0_string_io/string_client_test.py new file mode 100755 index 0000000000..16112ac70c --- /dev/null +++ b/qa/L0_string_io/string_client_test.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest +from builtins import range + +import numpy as np +import test_util as tu +import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient +import tritonclient.utils as tritonutils + + +class ClientStringTest(tu.TestResultCollector): + def _test_infer_unicode(self, model_name, client, input_): + # Send inference request to the inference server. Get results for + # both output tensors. + inputs = [] + outputs = [] + inputs.append(client[1].InferInput("INPUT0", input_.shape, "BYTES")) + + if client[1] == tritonhttpclient: + inputs[0].set_data_from_numpy(input_, client[3]) + else: + inputs[0].set_data_from_numpy(input_) + + if client[1] == tritonhttpclient: + outputs.append( + client[1].InferRequestedOutput("OUTPUT0", binary_data=client[2]) + ) + else: + outputs.append(client[1].InferRequestedOutput("OUTPUT0")) + + results = client[0].infer(model_name=model_name, inputs=inputs, outputs=outputs) + + out0 = results.as_numpy("OUTPUT0") + # We expect there to be 1 results (with batch-size 1). Verify + # that all 8 result elements are the same as the input. + self.assertTrue(np.array_equal(input_, out0)) + return out0 + + def _test_infer_non_unicode(self, model_name, client, input_, binary_data=True): + # Send inference request to the inference server. Get results for + # both output tensors. 
+ inputs = [] + outputs = [] + inputs.append(client[1].InferInput("INPUT0", input_.shape, "BYTES")) + + if client[1] == tritonhttpclient: + inputs[0].set_data_from_numpy(input_, client[3]) + else: + inputs[0].set_data_from_numpy(input_) + + if client[1] == tritonhttpclient: + outputs.append( + client[1].InferRequestedOutput("OUTPUT0", binary_data=client[2]) + ) + else: + outputs.append(client[1].InferRequestedOutput("OUTPUT0")) + + results = client[0].infer(model_name=model_name, inputs=inputs, outputs=outputs) + + out0 = results.as_numpy("OUTPUT0") + # We expect there to be 1 results (with batch-size 1). Verify + # that all 8 result elements are the same as the input. + if client[2]: + self.assertTrue(np.array_equal(input_.astype(np.bytes_), out0)) + else: + self.assertTrue( + np.array_equal(input_.astype(np.bytes_), out0.astype(np.bytes_)) + ) + return out0 + + def _test_unicode_bytes_dtype(self, client, model_name, dtype="|S78"): + # Create the data for the input tensor. Initialize the tensor to 8 + # byte strings. (dtype of np.bytes_) + # Sample string that should no longer cause failure + in0 = np.array( + [ + [ + b"\nF\n'\n\x01a\x12\"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf" + ], + [ + b"\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf" + ], + [ + b"\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@" + ], + [ + b"\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe" + ], + [ + b"\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04" + ], + [ + b"\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf\x00\x00" + ], + [ + b"\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>\x00\x00\x00" + ], + [ + b"\nF\n'\n\x01a\x12\"\x1a \n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf" + ], + ], + dtype=dtype, + ).flatten() + self._test_infer_unicode(model_name, client, in0) + + def _test_str_dtype(self, client, model_name, dtype=np.object_): + in0_bytes = np.array([str(i) for i in range(10000, 10008)], dtype=dtype) + self._test_infer_non_unicode(model_name, client, in0_bytes) + + in0_bytes = np.array([i for i in range(10000, 10008)], dtype=dtype) + self._test_infer_non_unicode(model_name, client, in0_bytes) + + def _test_bytes(self, model_name): + dtypes = [np.object_, np.bytes_] + + # This clients will fail for binary_data=False when the binary input + # is not UTF-8 encodable. They should work for other cases however. 
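+        # Each HTTP client tuple below is assumed to be laid out as
+        # (client, client_module, output_binary_data, input_binary_data):
+        # client[2] is forwarded to InferRequestedOutput(..., binary_data=...)
+        # and client[3] to set_data_from_numpy() when setting the input.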
+ binary_false_clients = [ + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + True, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + False, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + False, + True, + ), + ] + + # These clients work for every data type + other_clients = [ + ( + tritongrpcclient.InferenceServerClient("localhost:8001", verbose=True), + tritongrpcclient, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + True, + True, + ), + ] + + for client in other_clients + binary_false_clients: + self._test_str_dtype(client, model_name) + for dtype in dtypes: + self._test_str_dtype(client, model_name, dtype) + + for client in other_clients: + self._test_unicode_bytes_dtype(client, model_name) + for dtype in dtypes: + self._test_unicode_bytes_dtype(client, model_name, dtype) + + for client in binary_false_clients: + with self.assertRaises(tritonutils.InferenceServerException): + self._test_unicode_bytes_dtype(client, model_name) + for dtype in dtypes: + with self.assertRaises(tritonutils.InferenceServerException): + self._test_unicode_bytes_dtype(client, model_name, dtype) + + def test_tf_unicode_bytes(self): + self._test_bytes("graphdef_nobatch_zero_1_object") + self._test_bytes("string_identity") + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_string_io/test.sh b/qa/L0_string_io/test.sh new file mode 100755 index 0000000000..eb45d43ba2 --- /dev/null +++ b/qa/L0_string_io/test.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +STRING_CLIENT_TEST_PY=string_client_test.py +EXPECTED_NUM_TESTS="1" + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f $CLIENT_LOG $SERVER_LOG +rm -fr models && mkdir models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/graphdef_nobatch_zero_1_object models/. +cp -r ../python_models/string_identity models/. +mkdir models/string_identity/1/ +mv models/string_identity/model.py models/string_identity/1/model.py + +(cd models/string_identity && \ + sed -i "s/\[ 1 \]/\[ 8 \]/" config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +set +e + +python $STRING_CLIENT_TEST_PY -v >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_tf_gpu_io/test.sh b/qa/L0_tf_gpu_io/test.sh new file mode 100755 index 0000000000..98a5dff1ef --- /dev/null +++ b/qa/L0_tf_gpu_io/test.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TF_TEST=tf_gpu_io_test.py +BACKENDS=${BACKENDS:="graphdef savedmodel"} + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +RET=0 +rm -f ./*.log + +# Test with qa identity TF models +for BACKEND in $BACKENDS; do + MODEL_NAME=${BACKEND}_zero_1_float32 + rm -fr models && mkdir -p models + cp -r $DATADIR/qa_identity_model_repository/${MODEL_NAME} \ + models/${MODEL_NAME}_def && \ + (cd models/${MODEL_NAME}_def && \ + sed -i 's/_zero_1_float32/&_def/' config.pbtxt) && \ + # Enable GPU I/O for TensorFlow model + cp -r models/${MODEL_NAME}_def models/${MODEL_NAME}_gpu && \ + (cd models/${MODEL_NAME}_gpu && \ + sed -i 's/_zero_1_float32_def/_zero_1_float32_gpu/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"gpu_io\"} ] } }" >> config.pbtxt) + + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" + SERVER_LOG="${MODEL_NAME}.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + python $TF_TEST TfGpuIoTest.test_${MODEL_NAME}_def >> ${BACKEND}.sanity.log 2>&1 + if (( $? != 0 )); then + cat ${BACKEND}.sanity.log + RET=1 + fi + + grep "is GPU tensor: true" $SERVER_LOG >> grep.out.log + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected neither input or output is GPU tensor\n***" + RET=1 + fi + + python $TF_TEST TfGpuIoTest.test_${MODEL_NAME}_gpu >> ${BACKEND}.gpu.sanity.log 2>&1 + if (( $? != 0 )); then + cat ${BACKEND}.gpu.sanity.log + RET=1 + fi + + grep "is GPU tensor: true" $SERVER_LOG >> grep.out.log + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected input and output are GPU tensors\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test savedmodel with mismatched key and name +rm -rf models && mkdir -p models +cp -r $DATADIR/qa_tf_tag_sigdef_repository/sig_tag0 models +(cd models/sig_tag0 && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"gpu_io\"} ] } }" >> config.pbtxt) + +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +SERVER_LOG="sig_tag0.server.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +CLIENT_LOG="sig_tag0.gpu.log" +python $TF_TEST TfGpuIoTest.test_sig_tag0 >> $CLIENT_LOG 2>&1 +if (( $? != 0 )); then + cat $CLIENT_LOG + RET=1 +fi +grep "is GPU tensor: true" $SERVER_LOG >> grep.out.log +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected input and output are GPU tensors\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_tf_gpu_io/tf_gpu_io_test.py b/qa/L0_tf_gpu_io/tf_gpu_io_test.py new file mode 100755 index 0000000000..fd3550e434 --- /dev/null +++ b/qa/L0_tf_gpu_io/tf_gpu_io_test.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + +TENSOR_SIZE = 16384 + + +class TfGpuIoTest(tu.TestResultCollector): + def _test_helper( + self, + model_name, + shape, + override_input_names=[], + override_output_names=[], + batching_enabled=False, + ): + try: + bs = 1 + if batching_enabled: + shape = [ + [ + bs, + ] + + shape + ] + iu.infer_zero( + self, + "graphdef", + bs, + np.float32, + shape, + shape, + override_model_name=model_name, + override_input_names=override_input_names, + override_output_names=override_output_names, + ) + + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_sig_tag0(self): + self._test_helper( + "sig_tag0", + [16], + override_input_names=["INPUT"], + override_output_names=["OUTPUT"], + ) + + def test_graphdef_zero_1_float32_def(self): + self._test_helper( + "graphdef_zero_1_float32_def", [TENSOR_SIZE], batching_enabled=True + ) + + def test_graphdef_zero_1_float32_gpu(self): + self._test_helper( + "graphdef_zero_1_float32_gpu", [TENSOR_SIZE], batching_enabled=True + ) + + def test_savedmodel_zero_1_float32_def(self): + self._test_helper( + "savedmodel_zero_1_float32_def", [TENSOR_SIZE], batching_enabled=True + ) + + def test_savedmodel_zero_1_float32_gpu(self): + self._test_helper( + "savedmodel_zero_1_float32_gpu", [TENSOR_SIZE], batching_enabled=True + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_tf_parameters/test.sh b/qa/L0_tf_parameters/test.sh new file mode 100755 index 0000000000..133b6ef68d --- /dev/null +++ b/qa/L0_tf_parameters/test.sh @@ -0,0 +1,150 @@ +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi +source ../common/util.sh + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_tf_parameters_repository +TEST_RESULT_FILE='test_results.txt' +CLIENT_LOG="./client.log" +TEST=tf_parameter_test.py +EXPECTED_NUM_TESTS="1" +MODEL_REPOSITORY=`pwd`/models +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./inference_server.log" + +RET=0 + +rm -rf $SERVER_LOG $CLIENT_LOG models/ +cp -r $DATADIR models +SERVER_ARGS="--model-repository=$MODEL_REPOSITORY" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST TFParameterTest.test_tf_variable_error>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Add the initialization operation +echo "{\"init_ops\": [\"init\"]}" > models/graphdef_variable/init_ops.json +echo "parameters: { key: \"TF_INIT_OPS_FILE\" value: { string_value:\"init_ops.json\" }}" >> models/graphdef_variable/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST TFParameterTest.test_tf_variable>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Move the initialization op to the model version folder. 
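+# (The case above resolved the TF_INIT_OPS_FILE path next to config.pbtxt; this
+# case is assumed to verify that the file is also picked up when it lives in
+# the model version directory, e.g. models/graphdef_variable/1/init_ops.json.)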
+mv models/graphdef_variable/init_ops.json models/graphdef_variable/1/ + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST TFParameterTest.test_tf_variable>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_tf_parameters/tf_parameter_test.py b/qa/L0_tf_parameters/tf_parameter_test.py new file mode 100755 index 0000000000..f1a4621d93 --- /dev/null +++ b/qa/L0_tf_parameters/tf_parameter_test.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.http as tritonhttpclient +import tritonclient.utils + + +class TFParameterTest(tu.TestResultCollector): + def setUp(self): + self._client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + + def _infer_helper(self): + # The model has a single variable which is added to the input. Since the + # variable is initialized to zero the input and output must match. 
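+        # i.e. the model is assumed to compute OUTPUT = INPUT + VARIABLE with
+        # the variable initialized to zero, so the assert_array_equal(output,
+        # input) check below is the complete correctness condition.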
+ model_name = "graphdef_variable" + input = np.array([10], dtype=np.int32) + + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT", input.shape, "INT32")) + inputs[-1].set_data_from_numpy(input) + + outputs = [] + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT")) + + results = self._client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + output = results.as_numpy("OUTPUT") + np.testing.assert_array_equal(output, input) + + def test_tf_variable(self): + self._infer_helper() + + def test_tf_variable_error(self): + with self.assertRaises(tritonclient.utils.InferenceServerException) as e: + self._infer_helper() + self.assertIn( + "FAILED_PRECONDITION: Could not find variable VARIABLE. This " + + "could mean that the variable has been deleted. In TF1, it can " + + "also mean the variable is uninitialized.", + e.exception.message(), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_tf_tag_sigdef/test.sh b/qa/L0_tf_tag_sigdef/test.sh new file mode 100755 index 0000000000..32248c74ad --- /dev/null +++ b/qa/L0_tf_tag_sigdef/test.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +CLIENT_LOG="./client.log" +TEST=tf_tag_sigdef_test.py + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_tf_tag_sigdef_repository +MODELDIR=`pwd`/models + +rm -rf $SERVER_LOG $CLIENT_LOG $MODELDIR +mkdir $MODELDIR +cp -r $DATADIR/* $MODELDIR + +EXPECTED_NUM_TESTS="4" +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$MODELDIR --exit-timeout-secs=120" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + if [ `grep -c "configuration expects 2 inputs, model provides 1" $SERVER_LOG` != "0" ]; then + echo -e "*** FAILED: sig_tag_different_io config autocompleted with wrong model tag variant, failed to load.\n" + RET=1 + fi + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TEST>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py b/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py new file mode 100755 index 0000000000..b4a11ac04e --- /dev/null +++ b/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonhttpclient as httpclient + + +class TagSigdefTest(tu.TestResultCollector): + base_model_name = "sig_tag" + base_tag = "serve" + test_tag = "testTag" + base_sig_def = "serving_default" + test_sig_def = "testSigDef" + dims = 16 + + def _test_helper(self, modelVersion, tag, sig_def): + shape = [self.dims] + model_name = self.base_model_name + str(modelVersion) + # The multiplier is defined during model creation. See server/qa/common/gen_tag_sigdef.py + # for details + multiplier = modelVersion + 1 + output_name = "OUTPUT" + triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT", shape, "FP32")) + input_data = np.ones(shape=shape).astype(np.float32) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + + outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True)) + results = triton_client.infer(model_name, inputs, outputs=outputs) + output_data = results.as_numpy(output_name) + test_output = input_data * multiplier + self.assertTrue(np.isclose(output_data, test_output).all()) + + def test_default(self): + self._test_helper(0, self.base_tag, self.base_sig_def) + + def test_sig_def(self): + self._test_helper(1, self.base_tag, self.test_sig_def) + + def test_tag(self): + self._test_helper(2, self.test_tag, self.base_sig_def) + + def test_tag_sig_def(self): + self._test_helper(3, self.test_tag, self.test_sig_def) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_tf_unknown_rank/test.sh b/qa/L0_tf_unknown_rank/test.sh new file mode 100755 index 0000000000..e279a46267 --- /dev/null +++ b/qa/L0_tf_unknown_rank/test.sh @@ -0,0 +1,127 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
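+# This script is assumed to cover three cases: a model with an unknown-rank
+# tensor that loads and returns its input unchanged (test_success), a request
+# whose input shape mismatches the model (test_wrong_input), and a scalar-tensor
+# model that is expected to fail config autofill at server startup.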
+ +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION} + +CLIENT_LOG="./client.log" +UNKNOWN_RANK_TEST=tf_unknown_rank_test.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f ./*.log +rm -fr models && mkdir -p models +cp -r $DATADIR/tf_model_store2/unknown_rank_* models/ + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +set +e +python $UNKNOWN_RANK_TEST UnknownRankTest.test_success >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +python $UNKNOWN_RANK_TEST UnknownRankTest.test_wrong_input >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Try to load model with scalar tensor. The server should fail to load the model. +rm -rf scalar_repo; mkdir scalar_repo +cp -r $DATADIR/tf_model_store3/scalar_model scalar_repo/ +SERVER_ARGS="--model-repository=`pwd`/scalar_repo --strict-model-config=false" +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "*** FAILED: unexpected success starting $SERVER" >> $CLIENT_LOG + RET=1 + kill $SERVER_PID + wait $SERVER_PID +else + ERROR_MESSAGE="Unable to autofill for 'scalar_model': the rank of model tensor 'x' is 0 and dimensions are not defined" + if [[ $(cat $SERVER_LOG | grep "${ERROR_MESSAGE}" | wc -l) -ne 2 ]]; then + echo -e "\n***\n*** Test Failed: "${ERROR_MESSAGE}" not found\n***" + cat $SERVER_LOG + RET=1 + fi +fi + + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py b/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py new file mode 100755 index 0000000000..add6b32c13 --- /dev/null +++ b/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonhttpclient +from tritonclientutils import * + + +class UnknownRankTest(tu.TestResultCollector): + # helper function to generate requests to the server + def infer_unknown(self, model_name, tensor_shape): + print("About to run the test") + input_data = np.random.random_sample(tensor_shape).astype(np.float32) + client = tritonhttpclient.InferenceServerClient("localhost:8000") + inputs = [ + tritonhttpclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + results = client.infer(model_name, inputs) + self.assertTrue(np.array_equal(results.as_numpy("OUTPUT"), input_data)) + + def test_success(self): + model_name = "unknown_rank_success" + tensor_shape = 1 + try: + self.infer_unknown(model_name, tensor_shape) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_wrong_input(self): + model_name = "unknown_rank_wrong_output" + tensor_shape = (1, 2) + try: + self.infer_unknown(model_name, tensor_shape) + self.fail( + "Found success when expected failure with model given " + "wrong input tensor [1,2] for input [-1,1]." + ) + except InferenceServerException as ex: + self.assertIn( + "unexpected shape for input 'INPUT' for model " + "'unknown_rank_wrong_output'. Expected [1], got [1,2]", + ex.message(), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_tftrt_optimization/test.sh b/qa/L0_tftrt_optimization/test.sh new file mode 100755 index 0000000000..04dcdc2f65 --- /dev/null +++ b/qa/L0_tftrt_optimization/test.sh @@ -0,0 +1,212 @@ +#!/bin/bash +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +DATADIR=/data/inferenceserver/${REPO_VERSION} + +CLIENT_LOG="./client.log" +TFTRT_OPTIMIZATION_TEST=tftrt_optimization_test.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1 --exit-on-error=false" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +RET=0 + +for MODEL in \ + graphdef_float32_float32_float32 \ + savedmodel_float32_float32_float32; do + rm -f ./*.log + rm -fr models && mkdir -p models + cp -r $DATADIR/qa_model_repository/${MODEL} \ + models/${MODEL}_def && \ + rm -fr models/${MODEL}_def/2 && \ + rm -fr models/${MODEL}_def/3 && \ + (cd models/${MODEL}_def && \ + sed -i 's/_float32_float32_float32/&_def/' config.pbtxt) && \ + # GPU execution accelerators with default setting + cp -r models/${MODEL}_def models/${MODEL}_trt && \ + (cd models/${MODEL}_trt && \ + sed -i 's/_float32_def/_float32_trt/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\"} ] } }" >> config.pbtxt) && \ + # GPU execution accelerators with correct parameters + cp -r models/${MODEL}_def models/${MODEL}_param && \ + (cd models/${MODEL}_param && \ + sed -i 's/_float32_def/_float32_param/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"precision_mode\" value: \"FP16\" } \ + parameters { key: \"minimum_segment_size\" value: \"1\" } }]}}" \ + >> config.pbtxt) && \ + # GPU execution accelerators with unknown parameters + cp -r models/${MODEL}_def models/${MODEL}_unknown_param && \ + (cd models/${MODEL}_unknown_param && \ + sed -i 's/_float32_def/_float32_unknown_param/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"precision_mode\" value: \"FP16\" } \ + parameters { key: \"segment_size\" value: \"1\" } }]}}" \ + >> config.pbtxt) && \ + # GPU execution accelerators with invalid parameters + cp -r models/${MODEL}_def models/${MODEL}_invalid_param && \ + (cd models/${MODEL}_invalid_param && \ + sed -i 's/_float32_def/_float32_invalid_param/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\" \ + parameters { key: \"precision_mode\" value: \"FP16\" } \ + parameters { key: \"max_workspace_size_bytes\" value: \"abc\" } }]}}" \ + >> config.pbtxt) && \ + # GPU execution accelerators on CPU context + cp -r models/${MODEL}_trt models/${MODEL}_cpu_trt && \ + (cd models/${MODEL}_cpu_trt && \ + sed -i 
's/_float32_trt/_float32_cpu_trt/' \ + config.pbtxt && \ + echo "instance_group [ { kind: KIND_CPU }]" >> config.pbtxt) && \ + # CPU execution accelerators + cp -r models/${MODEL}_def models/${MODEL}_openvino && \ + (cd models/${MODEL}_openvino && \ + sed -i 's/_float32_def/_float32_openvino/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { cpu_execution_accelerator : [ { name : \"openvino\" } ] } }" >> config.pbtxt) && \ + # Unknown GPU execution accelerator + cp -r models/${MODEL}_def models/${MODEL}_unknown_gpu && \ + (cd models/${MODEL}_unknown_gpu && \ + sed -i 's/_float32_def/_float32_unknown_gpu/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"unknown_gpu\" } ] } }" >> config.pbtxt) && \ + # Unknown CPU execution accelerators + cp -r models/${MODEL}_def models/${MODEL}_unknown_cpu && \ + (cd models/${MODEL}_unknown_cpu && \ + sed -i 's/_float32_def/_float32_unknown_cpu/' \ + config.pbtxt && \ + echo "optimization { execution_accelerators { cpu_execution_accelerator : [ { name : \"unknown_cpu\" } ] } }" >> config.pbtxt) + + run_server_tolive + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + grep "TensorRT Execution Accelerator is set for ${MODEL}_trt" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected TensorRT Execution Accelerator is set\n***" + RET=1 + fi + + grep "TensorRT Execution Accelerator is set for ${MODEL}_param" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected TensorRT Execution Accelerator is set\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_unknown_param' version 1: Invalid argument: unknown parameter 'segment_size' is provided for TensorRT Execution Accelerator" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected unknown parameter 'segment_size' returns error\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_invalid_param' version 1: Invalid argument: failed to convert 'abc' to long long integral number" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected invalid parameter 'abc' returns error\n***" + RET=1 + fi + + grep "GPU Execution Accelerator will be ignored for model instance on CPU" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected logged warning: GPU Execution Accelerator will be ignored for model instance on CPU\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_openvino' version 1: Invalid argument: CPU Execution Accelerator is not supported in TensorFlow backend" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected CPU Execution Accelerator returns error\n***" + RET=1 + fi + + grep "failed to load '${MODEL}_unknown_gpu' version 1: Invalid argument: unknown Execution Accelerator 'unknown_gpu' is requested" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'unknown_gpu' Execution Accelerator returns error\n***" + RET=1 + fi + grep "failed to load '${MODEL}_unknown_cpu' version 1: Invalid argument: CPU Execution Accelerator is not supported in TensorFlow backend" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected 'unknown_cpu' Execution Accelerator returns error\n***" + RET=1 + fi + + TEST_TYPE=test_graphdef && \ + [[ "$MODEL" == "savedmodel_float32_float32_float32" ]] && \ + TEST_TYPE=test_savedmodel + echo "Test: $MODEL" >>$CLIENT_LOG + python $TFTRT_OPTIMIZATION_TEST TFTRTOptimizationTest.$TEST_TYPE \ + >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_tftrt_optimization/tftrt_optimization_test.py b/qa/L0_tftrt_optimization/tftrt_optimization_test.py new file mode 100755 index 0000000000..9e59677317 --- /dev/null +++ b/qa/L0_tftrt_optimization/tftrt_optimization_test.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
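+# The addsub models exercised below are assumed to compute
+# OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1; the "_trt" and
+# "_param" model variants only differ in the TF-TRT execution-accelerator
+# settings appended to config.pbtxt by test.sh, so results should match exactly.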
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonhttpclient as httpclient + + +class TFTRTOptimizationTest(tu.TestResultCollector): + def setUp(self): + self.input0_ = np.arange(start=0, stop=16, dtype=np.float32).reshape(1, 16) + self.input1_ = np.ones(shape=16, dtype=np.float32).reshape(1, 16) + self.expected_output0_ = self.input0_ + self.input1_ + self.expected_output1_ = self.input0_ - self.input1_ + + def _addsub_infer(self, model_name): + triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) + + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32")) + + # Initialize the data + inputs[0].set_data_from_numpy(self.input0_, binary_data=True) + inputs[1].set_data_from_numpy(self.input1_, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=True)) + + results = triton_client.infer(model_name, inputs, outputs=outputs) + + output0_data = results.as_numpy("OUTPUT0") + output1_data = results.as_numpy("OUTPUT1") + + self.assertTrue( + np.array_equal(self.expected_output0_, output0_data), "incorrect sum" + ) + self.assertTrue( + np.array_equal(self.expected_output1_, output1_data), "incorrect difference" + ) + + def test_graphdef(self): + self._addsub_infer("graphdef_float32_float32_float32_trt") + self._addsub_infer("graphdef_float32_float32_float32_param") + + def test_savedmodel(self): + self._addsub_infer("savedmodel_float32_float32_float32_trt") + self._addsub_infer("savedmodel_float32_float32_float32_param") + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trace/models/input_all_required/1/model.py b/qa/L0_trace/models/input_all_required/1/model.py new file mode 100644 index 0000000000..8d51130d06 --- /dev/null +++ b/qa/L0_trace/models/input_all_required/1/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import time + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + # Less than collector timeout which is 10 + time.sleep(2) + responses = [] + for _ in requests: + # Include one of each specially parsed JSON value: nan, inf, and -inf + out_0 = np.array([1], dtype=np.float32) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + + return responses diff --git a/qa/L0_trace/models/input_all_required/config.pbtxt b/qa/L0_trace/models/input_all_required/config.pbtxt new file mode 100644 index 0000000000..1426af2b65 --- /dev/null +++ b/qa/L0_trace/models/input_all_required/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "input_all_required" +backend: "python" +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "INPUT2" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] \ No newline at end of file diff --git a/qa/L0_trace/opentelemetry_unittest.py b/qa/L0_trace/opentelemetry_unittest.py new file mode 100644 index 0000000000..34dc0bfd88 --- /dev/null +++ b/qa/L0_trace/opentelemetry_unittest.py @@ -0,0 +1,1054 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
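+#
+# Note: these tests assume an OpenTelemetry collector binary (./otelcol) and a
+# ./trace-config.yaml configuration that exports the collected spans to
+# ./collected_traces.json; setUp() below starts the collector for each test.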
+ +import sys + +sys.path.append("../common") +import concurrent.futures +import json +import queue +import re +import shutil +import subprocess +import time +import unittest +from functools import partial + +import numpy as np +import requests +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + +NO_PARENT_SPAN_ID = "" +COLLECTOR_TIMEOUT = 10 + + +def callback(user_data, result, error): + if error: + user_data.put(error) + else: + user_data.put(result) + + +def prepare_data(client, is_binary=True): + inputs = [] + dim = 16 + input_data = np.arange(dim, dtype=np.int32) + inputs.append(client.InferInput("INPUT0", [1, dim], "INT32")) + inputs.append(client.InferInput("INPUT1", [1, dim], "INT32")) + + # Initialize the data + input_data = np.expand_dims(input_data, axis=0) + + if is_binary: + inputs[0].set_data_from_numpy(input_data) + inputs[1].set_data_from_numpy(input_data) + else: + inputs[0].set_data_from_numpy(input_data, binary_data=is_binary) + inputs[1].set_data_from_numpy(input_data, binary_data=is_binary) + + return inputs + + +def send_bls_request(model_name="simple", headers=None): + with httpclient.InferenceServerClient("localhost:8000") as client: + inputs = prepare_data(httpclient) + inputs.append(httpclient.InferInput("MODEL_NAME", [1], "BYTES")) + inputs[-1].set_data_from_numpy(np.array([model_name], dtype=np.object_)) + client.infer("bls_simple", inputs, headers=headers) + + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + + +class OpenTelemetryTest(tu.TestResultCollector): + def setUp(self): + self.collector_subprocess = subprocess.Popen( + ["./otelcol", "--config", "./trace-config.yaml"] + ) + time.sleep(5) + self.filename = "collected_traces.json" + # This simulates OTel context being injected on client side. + # Format explained here: https://www.w3.org/TR/trace-context/#design-overview + # OTel code reference for extraction: + # https://github.com/open-telemetry/opentelemetry-cpp/blob/c4f39f2be8109fd1a3e047677c09cf47954b92db/api/include/opentelemetry/trace/propagation/http_trace_context.h#L165 + # Essentially, this is what will be injected to headers/metadata + # on the client side. Code reference: + # https://github.com/open-telemetry/opentelemetry-cpp/blob/c4f39f2be8109fd1a3e047677c09cf47954b92db/api/include/opentelemetry/trace/propagation/http_trace_context.h#L91 + # Format is: 00-traceId-spanId-traceFlags + # By simply adding this header during tests, we imitate + # that on client side OTel Propagator injected it to request. 
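+        # For example, splitting the traceparent value used below on "-" yields:
+        #   version     = "00"
+        #   trace-id    = "0af7651916cd43dd8448eb211c12666c"
+        #   parent-id   = "b7ad6b7169242424"  (the client's span id)
+        #   trace-flags = "01"  (sampled)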
+ self.client_headers = dict( + {"traceparent": "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"} + ) + self.simple_model_name = "simple" + self.ensemble_model_name = "ensemble_add_sub_int32_int32_int32" + self.input_all_required_model_name = "input_all_required" + self.cancel_queue_model_name = "dynamic_batch" + self.bls_model_name = "bls_simple" + self.trace_context_model = "trace_context" + self.non_decoupled_model_name_ = "repeat_int32" + self.identity_model = "custom_identity_int32" + self.test_models = [ + self.simple_model_name, + self.ensemble_model_name, + self.bls_model_name, + self.non_decoupled_model_name_, + self.cancel_queue_model_name, + self.identity_model, + ] + self.root_span = "InferRequest" + self._user_data = UserData() + self._callback = partial(callback, self._user_data) + self._outputs = [] + self.input_data = { + "IN": np.array([1], dtype=np.int32), + "DELAY": np.array([0], dtype=np.uint32), + "WAIT": np.array([0], dtype=np.uint32), + } + + def tearDown(self): + self.collector_subprocess.kill() + self.collector_subprocess.wait() + time.sleep(5) + test_name = unittest.TestCase.id(self).split(".")[-1] + shutil.copyfile(self.filename, self.filename + "_" + test_name + ".log") + + def _get_inputs(self, batch_size): + shape = [batch_size, 8] + inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")] + inputs[0].set_data_from_numpy(np.ones(shape, dtype=np.float32)) + return inputs + + def _generate_callback_and_response_pair(self): + response = {"responded": False, "result": None, "error": None} + + def callback_queue(result, error): + response["responded"] = True + response["result"] = result + response["error"] = error + + return callback_queue, response + + def _parse_trace_log(self, trace_log): + """ + Helper function that parses file, containing collected traces. + + Args: + trace_log (str): Name of a file, containing all traces. + + Returns: + traces (List[dict]): List of json objects, representing each span. + """ + traces = [] + with open(trace_log) as f: + for json_obj in f: + entry = json.loads(json_obj) + traces.append(entry) + + return traces + + def _check_events(self, span_name, events, is_cancelled): + """ + Helper function that verifies passed events contain expected entries. + + Args: + span_name (str): name of a span. + events (List[str]): list of event names, collected for the span with the name `span_name`. 
+ """ + root_events_http = [ + "HTTP_RECV_START", + "HTTP_RECV_END", + "INFER_RESPONSE_COMPLETE", + "HTTP_SEND_START", + "HTTP_SEND_END", + ] + root_events_grpc = [ + "GRPC_WAITREAD_START", + "GRPC_WAITREAD_END", + "INFER_RESPONSE_COMPLETE", + "GRPC_SEND_START", + "GRPC_SEND_END", + ] + cancel_root_events_http = [ + "HTTP_RECV_START", + "HTTP_RECV_END", + ] + cancel_root_events_grpc = [ + "GRPC_WAITREAD_START", + "GRPC_WAITREAD_END", + ] + request_events = ["REQUEST_START", "QUEUE_START", "REQUEST_END"] + compute_events = [ + "COMPUTE_START", + "COMPUTE_INPUT_END", + "COMPUTE_OUTPUT_START", + "COMPUTE_END", + ] + + if span_name == "compute": + # Check that all compute related events (and only them) + # are recorded in compute span + self.assertTrue(all(entry in events for entry in compute_events)) + self.assertFalse(all(entry in events for entry in request_events)) + self.assertFalse( + all(entry in events for entry in root_events_http + root_events_grpc) + ) + self.assertEquals(len(events), len(compute_events)) + + elif span_name == self.root_span: + # Check that root span has INFER_RESPONSE_COMPLETE, _RECV/_WAITREAD + # and _SEND events (and only them) + if is_cancelled == True: + root_events_http = cancel_root_events_http + root_events_grpc = cancel_root_events_grpc + + if "HTTP" in events: + self.assertTrue(all(entry in events for entry in root_events_http)) + self.assertFalse(all(entry in events for entry in root_events_grpc)) + self.assertEquals(len(events), len(root_events_http)) + + elif "GRPC" in events: + self.assertTrue(all(entry in events for entry in root_events_grpc)) + self.assertFalse(all(entry in events for entry in root_events_http)) + self.assertEquals(len(events), len(root_events_grpc)) + + if is_cancelled == False: + self.assertFalse(all(entry in events for entry in request_events)) + self.assertFalse(all(entry in events for entry in compute_events)) + + elif span_name in self.test_models: + if span_name == self.identity_model: + request_events.append("CUSTOM_SINGLE_ACTIVITY") + # Check that all request related events (and only them) + # are recorded in request span + self.assertTrue(all(entry in events for entry in request_events)) + self.assertFalse( + all(entry in events for entry in root_events_http + root_events_grpc) + ) + self.assertFalse(all(entry in events for entry in compute_events)) + self.assertEquals(len(events), len(request_events)) + + elif span_name.startswith("CUSTOM_ACTIVITY"): + custom_activity_events = [] + if len(span_name) > len("CUSTOM_ACTIVITY"): + custom_activity_events.append(str(span_name + "_START")) + custom_activity_events.append(str(span_name + "_END")) + # Check `custom_identity_int32` config file, + # parameter `single_activity_frequency` identifies + # which custom spans contain "CUSTOM_SINGLE_ACTIVITY" event + if int(span_name[-1]) % 3 == 0: + custom_activity_events.append("CUSTOM_SINGLE_ACTIVITY") + else: + custom_activity_events = [ + "CUSTOM_ACTIVITY_START", + "CUSTOM_ACTIVITY_END", + ] + + self.assertTrue( + all(entry in events for entry in custom_activity_events), + "Span " + span_name, + ) + self.assertEquals( + len(events), len(custom_activity_events), "Span " + span_name + ) + + def _test_resource_attributes(self, attributes): + """ + Helper function that verifies passed span attributes. 
+        Currently only tests 2 attributes, specified upon tritonserver start:
+
+        --trace-config=opentelemetry,resource=test.key=test.value
+        and
+        --trace-config=opentelemetry,resource=service.name=test_triton
+
+        Args:
+            attributes (List[dict]): list of attributes, collected for a span.
+        """
+        expected_service_name = dict(
+            {"key": "service.name", "value": {"stringValue": "test_triton"}}
+        )
+        expected_test_key_value = dict(
+            {"key": "test.key", "value": {"stringValue": "test.value"}}
+        )
+        self.assertIn(
+            expected_service_name,
+            attributes,
+            "Expected entry: {}, was not found in the set of collected attributes: {}".format(
+                expected_service_name, attributes
+            ),
+        )
+        self.assertIn(
+            expected_test_key_value,
+            attributes,
+            "Expected entry: {}, was not found in the set of collected attributes: {}".format(
+                expected_test_key_value, attributes
+            ),
+        )
+
+    def _verify_contents(self, spans, expected_counts, is_cancelled):
+        """
+        Helper function that:
+         * iterates over `spans` and for every span verifies that proper events are collected
+         * verifies that `spans` has the expected total number of spans collected
+         * verifies that `spans` contains the expected number of different spans,
+           specified in `expected_counts` in the form:
+                span_name : #expected_number_of_entries
+
+        Args:
+            spans (List[dict]): list of json objects, extracted from the trace and
+                    containing span info. For this test `name`
+                    and `events` are required.
+            expected_counts (dict): dictionary, containing expected spans in the form:
+                    span_name : #expected_number_of_entries
+            is_cancelled (bool): True if called from a cancellation workflow
+        """
+
+        span_names = []
+        for span in spans:
+            # Check that collected spans have proper events recorded
+            span_name = span["name"]
+            span_names.append(span_name)
+            span_events = span["events"]
+            event_names_only = [event["name"] for event in span_events]
+            self._check_events(span_name, event_names_only, is_cancelled)
+
+        self.assertEqual(
+            len(span_names),
+            sum(expected_counts.values()),
+            "Unexpected number of span names collected",
+        )
+        for name, count in expected_counts.items():
+            self.assertEqual(
+                span_names.count(name),
+                count,
+                "Unexpected number of " + name + " spans collected",
+            )
+
+    def _verify_nesting(self, spans, expected_parent_span_dict):
+        """
+        Helper function that checks that parent-child relationships between
+        collected spans are the same as in `expected_parent_span_dict`.
+
+        Args:
+            spans (List[dict]): list of json objects, extracted from the trace and
+                    containing span info. For this test `name`, `spanId`
+                    and `parentSpanId` are required.
+            expected_parent_span_dict (dict): dictionary, containing expected
+                    parents and children in the dictionary form:
+                    (str) : (List[str])
+        """
+        seen_spans = {}
+        for span in spans:
+            cur_span = span["spanId"]
+            seen_spans[cur_span] = span["name"]
+
+        parent_child_dict = {}
+        for span in spans:
+            cur_parent = span["parentSpanId"]
+            cur_span = span["name"]
+            if cur_parent in seen_spans.keys():
+                parent_name = seen_spans[cur_parent]
+                if parent_name not in parent_child_dict:
+                    parent_child_dict[parent_name] = []
+                parent_child_dict[parent_name].append(cur_span)
+
+        for key in parent_child_dict.keys():
+            parent_child_dict[key].sort()
+
+        self.assertDictEqual(parent_child_dict, expected_parent_span_dict)
+
+    def _verify_headers_propagated_from_client_if_any(self, root_span, headers):
+        """
+        Helper function that checks that the traceparent ids passed in the
+        client's headers/metadata were picked up on the server side.
+
+        If `headers` is None, checks that `root_span` does not have
+        `parentSpanId` specified.
+
+        Args:
+            root_span (dict): a json object, extracted from the trace and
+                    containing root span info. For this test `traceId`
+                    and `parentSpanId` are required.
+            headers (dict | None): dictionary, containing the `traceparent`
+                    entry injected on the client side, or None if no OTel
+                    context was propagated.
+        """
+        parent_span_id = NO_PARENT_SPAN_ID
+
+        if headers is not None:
+            parent_span_id = headers["traceparent"].split("-")[2]
+            parent_trace_id = headers["traceparent"].split("-")[1]
+            self.assertEqual(
+                root_span["traceId"],
+                parent_trace_id,
+                "Child and parent trace ids do not match! child's trace id = {} , expected trace id = {}".format(
+                    root_span["traceId"], parent_trace_id
+                ),
+            )
+
+        self.assertEqual(
+            root_span["parentSpanId"],
+            parent_span_id,
+            "Child and parent span ids do not match! child's parentSpanId = {} , expected parentSpanId {}".format(
+                root_span["parentSpanId"], parent_span_id
+            ),
+        )
+
+    def _test_trace_cancel(self, is_queued):
+        # We want to capture the cancellation request's traces WHILE the inference is in the COMPUTE stage.
+        # The model "input_all_required" has a delay/wait in the compute phase, so the cancellation
+        # request can be sent while the request is waiting in the compute phase.
+        # The idea here is to wait before we try to read the traces from the file.
+        time.sleep(2 * COLLECTOR_TIMEOUT)
+        traces = self._parse_trace_log(self.filename)
+        if not is_queued:
+            expected_counts = dict(
+                {"compute": 1, self.input_all_required_model_name: 1, self.root_span: 1}
+            )
+        else:
+            # Compute is expected to be 0 as the request is cancelled in the queue
+            expected_counts = dict(
+                {"compute": 0, self.cancel_queue_model_name: 1, self.root_span: 1}
+            )
+        parsed_spans = traces[0]["resourceSpans"][0]["scopeSpans"][0]["spans"]
+        self._verify_contents(parsed_spans, expected_counts, is_cancelled=True)
+
+    def _test_trace(
+        self,
+        headers,
+        expected_number_of_spans,
+        expected_counts,
+        expected_parent_span_dict,
+    ):
+        """
+        Helper method that defines the general test scenario for a trace,
+        described as follows.
+
+        1. Parse the trace log, exported by the OTel collector to self.filename.
+        2. For each test we re-start the OTel collector, so the trace log should
+           have only 1 trace.
+        3. Test that reported resource attributes contain those manually specified
+           at `tritonserver` start time. Currently only 2 attributes are tested,
+           specified upon tritonserver start:
+
+           --trace-config=opentelemetry,resource=test.key=test.value
+           and
+           --trace-config=opentelemetry,resource=service.name=test_triton
+        4. Verifies that every collected span has the expected contents
+        5. Verifies parent-child span relationships
+        6. Verifies that OTel context was propagated from the client side
+           to the server side through headers. For cases when headers for
+           context propagation were not specified, checks that root_span has
+           no `parentSpanId` specified.
+
+        Args:
+            headers (dict | None): dictionary, containing OTel headers,
+                specifying OTel context.
+            expected_number_of_spans (int): expected number of collected spans.
+ expected_counts(dict): dictionary, containing expected spans in the form: + span_name : #expected_number_of_entries + expected_parent_span_dict (dict): dictionary, containing expected + parents and children in the dictionary form: + (str) : (List[str]) + """ + time.sleep(COLLECTOR_TIMEOUT) + traces = self._parse_trace_log(self.filename) + expected_traces_number = 1 + self.assertEqual( + len(traces), + expected_traces_number, + "Unexpected number of traces collected. Expected {}, but got {}".format( + expected_traces_number, len(traces) + ), + ) + self._test_resource_attributes( + traces[0]["resourceSpans"][0]["resource"]["attributes"] + ) + + parsed_spans = traces[0]["resourceSpans"][0]["scopeSpans"][0]["spans"] + root_span = [ + entry for entry in parsed_spans if entry["name"] == "InferRequest" + ][0] + self.assertEqual(len(parsed_spans), expected_number_of_spans) + self._verify_contents(parsed_spans, expected_counts, is_cancelled=False) + self._verify_nesting(parsed_spans, expected_parent_span_dict) + self._verify_headers_propagated_from_client_if_any(root_span, headers) + + def _test_simple_trace(self, headers=None): + """ + Helper function, that specifies expected parameters to evaluate trace, + collected from running 1 inference request for `simple` model. + """ + expected_number_of_spans = 3 + expected_counts = dict( + {"compute": 1, self.simple_model_name: 1, self.root_span: 1} + ) + expected_parent_span_dict = dict( + {"InferRequest": ["simple"], "simple": ["compute"]} + ) + self._test_trace( + headers=headers, + expected_number_of_spans=expected_number_of_spans, + expected_counts=expected_counts, + expected_parent_span_dict=expected_parent_span_dict, + ) + + def _test_custom_identity_trace(self, headers=None): + """ + Helper function, that specifies expected parameters to evaluate trace, + collected from running 1 inference request for `custom_identity_int32` + model. + Number of custom spans defined by the identity backend. + `CUSTOM_ACTIVITY` span will always be there, + `CUSTOM_ACTIVITY` defined by `config.pbtxt parameters`. + """ + expected_number_of_spans = 10 + expected_counts = dict( + { + "compute": 1, + self.identity_model: 1, + self.root_span: 1, + "CUSTOM_ACTIVITY": 1, + "CUSTOM_ACTIVITY0": 1, + "CUSTOM_ACTIVITY1": 1, + "CUSTOM_ACTIVITY2": 1, + "CUSTOM_ACTIVITY3": 1, + "CUSTOM_ACTIVITY4": 1, + "CUSTOM_ACTIVITY5": 1, + } + ) + expected_parent_span_dict = dict( + { + "InferRequest": ["custom_identity_int32"], + "custom_identity_int32": [ + "CUSTOM_ACTIVITY", + "CUSTOM_ACTIVITY0", + "compute", + ], + "CUSTOM_ACTIVITY0": ["CUSTOM_ACTIVITY1"], + "CUSTOM_ACTIVITY1": ["CUSTOM_ACTIVITY2"], + "CUSTOM_ACTIVITY2": ["CUSTOM_ACTIVITY3"], + "CUSTOM_ACTIVITY3": ["CUSTOM_ACTIVITY4"], + "CUSTOM_ACTIVITY4": ["CUSTOM_ACTIVITY5"], + } + ) + self._test_trace( + headers=headers, + expected_number_of_spans=expected_number_of_spans, + expected_counts=expected_counts, + expected_parent_span_dict=expected_parent_span_dict, + ) + + def _test_non_decoupled_trace(self, headers=None): + """ + Helper function, that collects trace for non decoupled model and verifies it. 
+ """ + expected_number_of_spans = 3 + expected_counts = dict( + {"compute": 1, self.non_decoupled_model_name_: 1, self.root_span: 1} + ) + expected_parent_span_dict = dict( + {"InferRequest": ["repeat_int32"], "repeat_int32": ["compute"]} + ) + self._test_trace( + headers=headers, + expected_number_of_spans=expected_number_of_spans, + expected_counts=expected_counts, + expected_parent_span_dict=expected_parent_span_dict, + ) + + def _test_bls_trace(self, headers=None): + """ + Helper function, that specifies expected parameters to evaluate trace, + collected from running 1 inference request for `bls_simple` model. + """ + expected_number_of_spans = 6 + expected_counts = dict( + { + "compute": 2, + self.simple_model_name: 1, + self.ensemble_model_name: 1, + self.bls_model_name: 1, + self.root_span: 1, + } + ) + expected_parent_span_dict = dict( + { + "InferRequest": ["bls_simple"], + "bls_simple": ["compute", "ensemble_add_sub_int32_int32_int32"], + "ensemble_add_sub_int32_int32_int32": ["simple"], + "simple": ["compute"], + } + ) + for key in expected_parent_span_dict.keys(): + expected_parent_span_dict[key].sort() + + self._test_trace( + headers=headers, + expected_number_of_spans=expected_number_of_spans, + expected_counts=expected_counts, + expected_parent_span_dict=expected_parent_span_dict, + ) + + def _test_ensemble_trace(self, headers=None): + """ + Helper function, that specifies expected parameters to evaluate trace, + collected from running 1 inference request for an + `ensemble_add_sub_int32_int32_int32` model. + """ + expected_number_of_spans = 4 + expected_counts = dict( + { + "compute": 1, + self.simple_model_name: 1, + self.ensemble_model_name: 1, + self.root_span: 1, + } + ) + expected_parent_span_dict = dict( + { + "InferRequest": ["ensemble_add_sub_int32_int32_int32"], + "ensemble_add_sub_int32_int32_int32": ["simple"], + "simple": ["compute"], + } + ) + for key in expected_parent_span_dict.keys(): + expected_parent_span_dict[key].sort() + + self._test_trace( + headers=headers, + expected_number_of_spans=expected_number_of_spans, + expected_counts=expected_counts, + expected_parent_span_dict=expected_parent_span_dict, + ) + + def test_http_trace_simple_model(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model and HTTP client. + """ + triton_client_http = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + inputs = prepare_data(httpclient) + triton_client_http.infer(self.simple_model_name, inputs) + + self._test_simple_trace() + + def test_http_trace_simple_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model, HTTP client and context propagation, + i.e. client specifies OTel headers, defined in `self.client_headers`. + """ + triton_client_http = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + inputs = prepare_data(httpclient) + triton_client_http.infer( + self.simple_model_name, inputs, headers=self.client_headers + ) + + self._test_simple_trace(headers=self.client_headers) + + def test_grpc_trace_simple_model(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model and GRPC client. 
+ """ + triton_client_grpc = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + inputs = prepare_data(grpcclient) + triton_client_grpc.infer(self.simple_model_name, inputs) + + self._test_simple_trace() + + def test_grpc_trace_all_input_required_model_cancel(self): + """ + Tests trace, collected from executing one inference request and cancelling the request + for a model and GRPC client. Expects only 2 GRPC stage events + """ + triton_client_grpc = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + inputs = [] + inputs.append(grpcclient.InferInput("INPUT0", [1], "FP32")) + inputs[0].set_data_from_numpy(np.arange(1, dtype=np.float32)) + inputs.append(grpcclient.InferInput("INPUT1", [1], "FP32")) + inputs[1].set_data_from_numpy(np.arange(1, dtype=np.float32)) + inputs.append(grpcclient.InferInput("INPUT2", [1], "FP32")) + inputs[2].set_data_from_numpy(np.arange(1, dtype=np.float32)) + future = triton_client_grpc.async_infer( + model_name=self.input_all_required_model_name, + inputs=inputs, + callback=self._callback, + outputs=self._outputs, + ) + time.sleep(2) # ensure the inference has started + future.cancel() + time.sleep(0.1) # context switch + self._test_trace_cancel(is_queued=False) + + # Test queued requests on dynamic batch scheduler can be cancelled + def test_grpc_trace_model_cancel_in_queue(self): + """ + Tests trace, collected from executing one inference request and cancelling the request + for a model and GRPC client while the request is in queue. Expects 0 compute stage traces + """ + model_name = self.cancel_queue_model_name + triton_client_grpc = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + with concurrent.futures.ThreadPoolExecutor() as pool: + # Saturate the slots on the model + saturate_thread = pool.submit( + triton_client_grpc.infer, model_name, self._get_inputs(batch_size=1) + ) + time.sleep(2) # ensure the slots are filled + # The next request should be queued + callback, response = self._generate_callback_and_response_pair() + future = triton_client_grpc.async_infer( + model_name, self._get_inputs(batch_size=1), callback + ) + time.sleep(0.2) # ensure the request is queued + future.cancel() + # Join saturating thread + saturate_thread.result() + self._test_trace_cancel(is_queued=True) + + def test_non_decoupled(self): + """ + Tests trace, collected from executing one inference request of non decoupled model. + """ + inputs = [ + grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer( + model_name=self.non_decoupled_model_name_, inputs=inputs + ) + self._test_non_decoupled_trace() + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + def test_grpc_trace_simple_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model, GRPC client and context propagation, + i.e. client specifies OTel headers, defined in `self.client_headers`. 
+ """ + triton_client_grpc = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + inputs = prepare_data(grpcclient) + triton_client_grpc.infer( + self.simple_model_name, inputs, headers=self.client_headers + ) + + self._test_simple_trace(headers=self.client_headers) + + def test_streaming_grpc_trace_simple_model(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model and GRPC streaming client. + """ + triton_client_grpc = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + user_data = queue.Queue() + triton_client_grpc.start_stream(callback=partial(callback, user_data)) + + inputs = prepare_data(grpcclient) + triton_client_grpc.async_stream_infer(self.simple_model_name, inputs) + result = user_data.get() + self.assertIsNot(result, InferenceServerException) + triton_client_grpc.stop_stream() + + self._test_simple_trace() + + def test_streaming_grpc_trace_simple_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model, GRPC streaming client and context propagation, + i.e. client specifies OTel headers, defined in `self.client_headers`. + """ + triton_client_grpc = grpcclient.InferenceServerClient( + "localhost:8001", verbose=True + ) + user_data = queue.Queue() + triton_client_grpc.start_stream( + callback=partial(callback, user_data), + headers=self.client_headers, + ) + + inputs = prepare_data(grpcclient) + triton_client_grpc.async_stream_infer(self.simple_model_name, inputs) + result = user_data.get() + self.assertIsNot(result, InferenceServerException) + triton_client_grpc.stop_stream() + + self._test_simple_trace(headers=self.client_headers) + + def test_http_trace_bls_model(self): + """ + Tests trace, collected from executing one inference request + for a `bls_simple` model and HTTP client. + """ + send_bls_request(model_name=self.ensemble_model_name) + + self._test_bls_trace() + + def test_http_trace_bls_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `bls_simple` model, HTTP client and context propagation, + i.e. client specifies OTel headers, defined in `self.client_headers`. + """ + send_bls_request( + model_name=self.ensemble_model_name, headers=self.client_headers + ) + + self._test_bls_trace(headers=self.client_headers) + + def test_http_trace_ensemble_model(self): + """ + Tests trace, collected from executing one inference request + for a `ensemble_add_sub_int32_int32_int32` model and HTTP client. + """ + triton_client_http = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + inputs = prepare_data(httpclient) + triton_client_http.infer(self.ensemble_model_name, inputs) + + self._test_ensemble_trace() + + def test_http_trace_ensemble_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `ensemble_add_sub_int32_int32_int32` model, HTTP client + and context propagation, i.e. client specifies OTel headers, + defined in `self.client_headers`. 
+ """ + triton_client_http = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + inputs = prepare_data(httpclient) + triton_client_http.infer( + self.ensemble_model_name, inputs, headers=self.client_headers + ) + + self._test_ensemble_trace(headers=self.client_headers) + + def test_http_trace_triggered(self): + triton_client_http = httpclient.InferenceServerClient("localhost:8000") + triton_client_http.update_trace_settings(settings={"trace_rate": "5"}) + + expected_trace_rate = "5" + simple_model_trace_settings = triton_client_http.get_trace_settings( + model_name=self.simple_model_name + ) + + self.assertEqual( + expected_trace_rate, + simple_model_trace_settings["trace_rate"], + "Unexpected model trace rate settings after its update. Expected {}, but got {}".format( + expected_trace_rate, simple_model_trace_settings["trace_rate"] + ), + ) + + inputs = prepare_data(httpclient) + for _ in range(5): + triton_client_http.infer(self.ensemble_model_name, inputs) + time.sleep(COLLECTOR_TIMEOUT) + + expected_accumulated_traces = 1 + traces = self._parse_trace_log(self.filename) + # Should only be 1 trace collected + self.assertEqual( + len(traces), + expected_accumulated_traces, + "Unexpected number of traces collected", + ) + + for _ in range(5): + triton_client_http.infer( + self.ensemble_model_name, inputs, headers=self.client_headers + ) + expected_accumulated_traces += 1 + time.sleep(COLLECTOR_TIMEOUT) + + traces = self._parse_trace_log(self.filename) + # Should only be 1 trace collected + self.assertEqual( + len(traces), + expected_accumulated_traces, + "Unexpected number of traces collected", + ) + + # Restore trace rate to 1 + triton_client_http.update_trace_settings(settings={"trace_rate": "1"}) + expected_trace_rate = "1" + simple_model_trace_settings = triton_client_http.get_trace_settings( + model_name=self.simple_model_name + ) + + self.assertEqual( + expected_trace_rate, + simple_model_trace_settings["trace_rate"], + "Unexpected model trace rate settings after its update. Expected {}, but got {}".format( + expected_trace_rate, simple_model_trace_settings["trace_rate"] + ), + ) + + def test_sagemaker_invocation_trace_simple_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model, SageMaker (invocations) and context propagation, + i.e. client specifies OTel headers, defined in `self.client_headers`. + """ + inputs = prepare_data(httpclient, is_binary=False) + request_body, _ = httpclient.InferenceServerClient.generate_request_body(inputs) + self.client_headers["Content-Type"] = "application/json" + r = requests.post( + "http://localhost:8080/invocations", + data=request_body, + headers=self.client_headers, + ) + r.raise_for_status() + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + self._test_simple_trace(headers=self.client_headers) + + def test_sagemaker_invoke_trace_simple_model_context_propagation(self): + """ + Tests trace, collected from executing one inference request + for a `simple` model, SageMaker (invoke) and context propagation, + i.e. client specifies OTel headers, defined in `self.client_headers`. 
+ """ + # Loading model for this test + model_url = "/opt/ml/models/123456789abcdefghi/model" + request_body = {"model_name": self.simple_model_name, "url": model_url} + headers = {"Content-Type": "application/json"} + r = requests.post( + "http://localhost:8080/models", + data=json.dumps(request_body), + headers=headers, + ) + time.sleep(5) # wait for model to load + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + + inputs = prepare_data(httpclient, is_binary=False) + request_body, _ = httpclient.InferenceServerClient.generate_request_body(inputs) + + self.client_headers["Content-Type"] = "application/json" + invoke_url = "{}/{}/invoke".format( + "http://localhost:8080/models", self.simple_model_name + ) + r = requests.post(invoke_url, data=request_body, headers=self.client_headers) + r.raise_for_status() + self.assertEqual( + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) + time.sleep(5) + self._test_simple_trace(headers=self.client_headers) + + def test_trace_context_exposed_to_pbe(self): + """ + Tests trace context, propagated to python backend. + """ + triton_client_http = httpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + expect_none = np.array([False], dtype=bool) + inputs = httpclient.InferInput("expect_none", [1], "BOOL") + inputs.set_data_from_numpy(expect_none) + try: + result = triton_client_http.infer(self.trace_context_model, inputs=[inputs]) + except InferenceServerException as e: + self.fail(e.message()) + + context = result.as_numpy("OUTPUT0")[()].decode("utf-8") + context = json.loads(context) + self.assertIn("traceparent", context.keys()) + context_pattern = re.compile(r"\d{2}-[0-9a-f]{32}-[0-9a-f]{16}-\d{2}") + self.assertIsNotNone(re.match(context_pattern, context["traceparent"])) + + def test_custom_backend_tracing(self): + """ + Tests custom activities reported from identity backend. + """ + input0_ = np.array([[4]], dtype=np.int32) + with httpclient.InferenceServerClient("localhost:8000", verbose=True) as client: + inputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 1], "INT32")) + inputs[0].set_data_from_numpy(input0_) + client.infer(self.identity_model, inputs=inputs) + self._test_custom_identity_trace() + + def test_custom_backend_tracing_context_propagation(self): + """ + Tests custom activities reported from identity backend. + """ + input0_ = np.array([[4]], dtype=np.int32) + with httpclient.InferenceServerClient("localhost:8000", verbose=True) as client: + inputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 1], "INT32")) + inputs[0].set_data_from_numpy(input0_) + client.infer( + self.identity_model, inputs=inputs, headers=self.client_headers + ) + + self._test_custom_identity_trace(headers=self.client_headers) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh new file mode 100755 index 0000000000..d2943c1996 --- /dev/null +++ b/qa/L0_trace/test.sh @@ -0,0 +1,1241 @@ +#!/bin/bash +# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +SIMPLE_HTTP_CLIENT=../clients/simple_http_infer_client +SIMPLE_GRPC_CLIENT=../clients/simple_grpc_infer_client +TRACE_SUMMARY=../common/trace_summary.py + +CLIENT_TEST=trace_endpoint_test.py +CLIENT_LOG="client.log" +TEST_RESULT_FILE="test_results.txt" +EXPECTED_NUM_TESTS="6" + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +ENSEMBLEDIR=$DATADIR/../qa_ensemble_model_repository/qa_model_repository/ +BLSDIR=../python_models/bls_simple +CANCELDIR=models/ +MODELBASE=onnx_int32_int32_int32 + +MODELSDIR=`pwd`/trace_models + +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh +rm -f *.log +rm -f *.log.* +rm -fr $MODELSDIR && mkdir -p $MODELSDIR +# set up model for inference delay queueing +mkdir -p trace_models/dynamic_batch/1 && (cd trace_models/dynamic_batch && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'instance_group [{ count: 1 \n kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'dynamic_batching {' >> config.pbtxt && \ + echo -e ' preferred_batch_size: [ 1 ]' >> config.pbtxt && \ + echo -e ' default_queue_policy { timeout_action: REJECT \n default_timeout_microseconds: 1000000 \n max_queue_size: 8 }' >> config.pbtxt && \ + echo -e '}' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "8000" } }]' >> config.pbtxt) + +# set up simple and global_simple model using MODELBASE +cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \ + rm -r $MODELSDIR/simple/2 && rm -r $MODELSDIR/simple/3 && \ + (cd $MODELSDIR/simple && \ + sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt) && \ + cp -r $MODELSDIR/simple $MODELSDIR/global_simple && \ + (cd $MODELSDIR/global_simple && \ + sed -i "s/^name:.*/name: \"global_simple\"/" config.pbtxt) && \ + cp -r 
$ENSEMBLEDIR/simple_onnx_int32_int32_int32 $MODELSDIR/ensemble_add_sub_int32_int32_int32 && \ + # set up new dir for cancel model + cp -r $CANCELDIR/input_all_required $MODELSDIR/input_all_required && \ + rm -r $MODELSDIR/ensemble_add_sub_int32_int32_int32/2 && \ + rm -r $MODELSDIR/ensemble_add_sub_int32_int32_int32/3 && \ + (cd $MODELSDIR/ensemble_add_sub_int32_int32_int32 && \ + sed -i "s/^name:.*/name: \"ensemble_add_sub_int32_int32_int32\"/" config.pbtxt && \ + sed -i "s/model_name:.*/model_name: \"simple\"/" config.pbtxt) && \ + mkdir -p $MODELSDIR/bls_simple/1 && cp $BLSDIR/bls_simple.py $MODELSDIR/bls_simple/1/model.py + +# set up repeat_int32 model +cp -r ../L0_decoupled/models/repeat_int32 $MODELSDIR +sed -i "s/decoupled: True/decoupled: False/" $MODELSDIR/repeat_int32/config.pbtxt + +# set up identity model +mkdir -p $MODELSDIR/custom_identity_int32/1 && (cd $MODELSDIR/custom_identity_int32 && \ + echo 'name: "custom_identity_int32"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1024' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "500" } }, { key: "enable_custom_tracing" \n value: { string_value: "true" } }]' >> config.pbtxt) + +RET=0 + +# Helpers ======================================= +function assert_curl_success { + message="${1}" + if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***" + RET=1 + fi +} + +function assert_curl_failure { + message="${1}" + if [ "$code" == "200" ]; then + cat ./curl.out + echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***" + RET=1 + fi +} + +function get_global_trace_setting { + rm -f ./curl.out + set +e + code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/trace/setting` + set -e +} + +function get_trace_setting { + model_name="${1}" + rm -f ./curl.out + set +e + code=`curl -s -w %{http_code} -o ./curl.out localhost:8000/v2/models/${model_name}/trace/setting` + set -e +} + +function update_global_trace_setting { + settings="${1}" + rm -f ./curl.out + set +e + code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/trace/setting -d ${settings}` + set -e +} + +function update_trace_setting { + model_name="${1}" + settings="${2}" + rm -f ./curl.out + set +e + code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/models/${model_name}/trace/setting -d ${settings}` + set -e +} + +function check_pbe_trace_context { + model_name="${1}" + expect_none="${2}" + data='{"inputs":[{"name":"expect_none","datatype":"BOOL","shape":[1],"data":['${expect_none}']}]}' + rm -f ./curl.out + set +e + code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/models/${model_name}/infer -d ${data}` + set -e +} + +function send_inference_requests { + log_file="${1}" + upper_bound="${2}" + for (( p = 1; p <= $upper_bound; p++ )) do + $SIMPLE_HTTP_CLIENT >> ${log_file} 2>&1 + if [ $? -ne 0 ]; then + RET=1 + fi + + $SIMPLE_GRPC_CLIENT >> ${log_file} 2>&1 + if [ $? 
-ne 0 ]; then + RET=1 + fi + done +} + +#======================================= + +# start with trace-level=OFF +SERVER_ARGS="--trace-config triton,file=trace_off_to_min.log --trace-config level=OFF --trace-config rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_off.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Enable via trace API and send again +update_global_trace_setting '{"trace_level":["TIMESTAMPS"]}' +assert_curl_success "Failed to modify global trace settings" + +# Check if the current setting is returned +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"trace_off_to_min.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +send_inference_requests "client_min.log" 10 + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +# Expect only the requests after calling trace API are traced +$TRACE_SUMMARY -t trace_off_to_min.log > summary_off_to_min.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_off_to_min.log` != "20" ]; then + cat summary_off_to_min.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_off_to_min.log` != "20" ]; then + cat summary_off_to_min.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# Add model specific setting +SERVER_ARGS="--trace-config triton,file=global_trace.log --trace-config level=TIMESTAMPS --trace-config rate=6 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_off.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Add trace setting for 'simple' via trace API, first use the same trace file +update_trace_setting "simple" '{"trace_file":"global_trace.log"}' +assert_curl_failure "trace_file updated through network protocol expects an error" + +# Check if the current setting is returned (not specified setting from global) +if [ `grep -c "\"error\":\"trace file location can not be updated through network protocol\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +# Use a different name +update_trace_setting "simple" '{"log_frequency":"2"}' +assert_curl_success "Failed to modify trace settings for 'simple' model" + +# Check if the current setting is returned (not specified setting from global) +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"6\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"-1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"2\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"global_trace.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +send_inference_requests "client_simple.log" 10 + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +if [ -f ./simple_trace.log ]; then + echo -e "\n***\n*** Test Failed, unexpected generation of simple_trace.log\n***" + RET=1 +fi + +$TRACE_SUMMARY -t global_trace.log.0 > summary_global_trace.log.0 
+ +if [ `grep -c "COMPUTE_INPUT_END" summary_global_trace.log.0` != "2" ]; then + cat summary_global_trace.log.0 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_global_trace.log.0` != "2" ]; then + cat summary_global_trace.log.0 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$TRACE_SUMMARY -t global_trace.log.1 > summary_global_trace.log.1 + +if [ `grep -c "COMPUTE_INPUT_END" summary_global_trace.log.1` != "1" ]; then + cat summary_global_trace.log.1 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_global_trace.log.1` != "1" ]; then + cat summary_global_trace.log.1 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# Update and clear model specific setting +SERVER_ARGS="--trace-config triton,file=global_trace.log --trace-config level=TIMESTAMPS --trace-config rate=6 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_off.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Add model setting and update it +update_trace_setting "simple" '{"trace_rate":"1"}' +assert_curl_success "Failed to modify trace settings for 'simple' model" + +update_trace_setting "simple" '{"trace_level":["OFF"]}' +assert_curl_success "Failed to modify trace settings for 'simple' model" + +# Check if the current setting is returned +if [ `grep -c "\"trace_level\":\[\"OFF\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"-1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"global_trace.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +# Send requests to simple where trace is explicitly disabled +send_inference_requests "client_update.log" 10 + +rm -f ./curl.out +set +e + +# Clear trace setting by explicitly asking removal for every field except 'trace_rate' +update_trace_setting "simple" '{"trace_level":null}' +assert_curl_success "Failed to modify trace settings for 'simple' model" + +# Check if the current setting (global) is returned +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"-1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"global_trace.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +# Send requests to simple where now uses global setting +send_inference_requests "client_clear.log" 5 + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +if [ -f ./update_trace.log ]; then + echo -e "\n***\n*** Test Failed, unexpected generation of update_trace.log\n***" + RET=1 +fi + +$TRACE_SUMMARY -t global_trace.log > summary_global_trace.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_global_trace.log` != "10" ]; then + cat summary_global_trace.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_global_trace.log` != "10" ]; then + cat summary_global_trace.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# Update trace count 
+SERVER_ARGS="--trace-config triton,file=global_count.log --trace-config level=TIMESTAMPS --trace-config rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_off.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# Send requests without trace count +send_inference_requests "client_update.log" 10 + +set -e + +# Check the current setting +get_trace_setting "simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"-1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"global_count.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +# Set trace count +update_global_trace_setting '{"trace_count":"5"}' +assert_curl_success "Failed to modify global trace settings" + +# Check if the current setting is returned +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"5\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"global_count.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +# Send requests to simple where trace is explicitly disabled +send_inference_requests "client_update.log" 10 + +# Check the current setting again and expect 'trace_count' becomes 0 +get_trace_setting "simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"global_count.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +# Check if the indexed file has been generated when trace count reaches 0 +if [ ! -f ./global_count.log.0 ]; then + echo -e "\n***\n*** Test Failed, expect generation of global_count.log.0 before stopping server\n***" + RET=1 +fi + +SETTINGS="trace_count trace_rate log_frequency" + +for SETTING in $SETTINGS; do + # Check `out of range` errors + update_trace_setting "simple" '{"'${SETTING}'":"10000000000"}' + assert_curl_failure "Server modified '${SETTING}' with an out of range value." 
+done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +# There should be two trace files for trace counted requests and before trace +# counted requests +$TRACE_SUMMARY -t global_count.log > summary_global_count.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_global_count.log` != "20" ]; then + cat summary_global_count.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_global_count.log` != "20" ]; then + cat summary_global_count.log + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$TRACE_SUMMARY -t global_count.log.0 > summary_global_count.log.0 + +if [ `grep -c "COMPUTE_INPUT_END" summary_global_count.log.0` != "5" ]; then + cat summary_global_count.log.0 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_global_count.log.0` != "5" ]; then + cat summary_global_count.log.0 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +# Test Python client library +SERVER_ARGS="--trace-config triton,file=global_unittest.log --trace-config level=TIMESTAMPS --trace-config rate=1 --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_unittest.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 + +set +e + +python $CLIENT_TEST >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + + +# Check `--trace-config` sets arguments properly +SERVER_ARGS="--trace-config=triton,file=bls_trace.log --trace-config=level=TIMESTAMPS \ + --trace-config=rate=4 --trace-config=count=6 --trace-config=mode=triton --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_trace_config.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +get_trace_setting "simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"4\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"6\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":\"bls_trace.log\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"triton\"" ./curl.out` != "1" ]; then + RET=1 +fi + +set +e +# Send bls requests to make sure simple model is traced +for p in {1..4}; do + python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1 +done + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +$TRACE_SUMMARY -t bls_trace.log > summary_bls.log + +if [ `grep -c "COMPUTE_INPUT_END" summary_bls.log` != "2" ]; then + cat summary_bls.log + echo -e "\n***\n*** Test Failed: Unexpected number of traced "COMPUTE_INPUT_END" events.\n***" + RET=1 +fi + +if [ `grep -c ^ensemble_add_sub_int32_int32_int32 summary_bls.log` != "1" ]; then + cat summary_bls.log + echo -e "\n***\n*** Test Failed: BLS child ensemble model wasn't traced. 
\n***" + RET=1 +fi + +if [ `grep -c ^simple summary_bls.log` != "1" ]; then + cat summary_bls.log + echo -e "\n***\n*** Test Failed: ensemble's model 'simple' wasn't traced. \n***" + RET=1 +fi + +if [ `grep -o 'parent_id' bls_trace.log | wc -l` != "2" ]; then + cat bls_trace.log + echo -e "\n***\n*** Test Failed: Unexpected number of 'parent id' fields. \n***" + RET=1 +fi + +# Attempt to trace non-existent model +SERVER_ARGS="--model-control-mode=explicit --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_nonexistent_model.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Explicitly load model +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/simple/load` +set -e +assert_curl_success "Failed to load 'simple' model" + +# Non-existent model (get) +get_trace_setting "does-not-exist" +assert_curl_failure "Server returned trace settings for a non-existent model" + +# Non-existent model (post) +update_trace_setting "does-not-exist" '{"log_frequency":"1"}' +assert_curl_failure "Server modified trace settings for a non-existent model" + +# Local model (get) +get_trace_setting "simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +# Local model (post) +update_trace_setting "simple" '{"log_frequency":"1"}' +assert_curl_success "Failed to modify trace settings for 'simple' model" + +# Local model (unload) +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/simple/unload` +set -e +assert_curl_success "Failed to unload 'simple' model" + +get_trace_setting "simple" +assert_curl_failure "Server returned trace settings for an unloaded model" + +update_trace_setting "simple" '{"log_frequency":"1"}' +assert_curl_failure "Server modified trace settings for an unloaded model" + +# Local model (reload) +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/simple/load` +set -e +assert_curl_success "Failed to load 'simple' model" + +get_trace_setting "simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +update_trace_setting "simple" '{"log_frequency":"1"}' +assert_curl_success "Failed to modify trace settings for 'simple' model" + +kill $SERVER_PID +wait $SERVER_PID + +set +e + +# Custom backend tracing +SERVER_ARGS="--model-control-mode=explicit --model-repository=$MODELSDIR + --load-model=custom_identity_int32 --trace-config=level=TIMESTAMPS \ + --trace-config=triton,file=custom_tracing_triton.log \ + --trace-config=rate=1 --trace-config=mode=triton" +SERVER_LOG="./custom_backend_tracing.log" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Send 1 inference request, should expect 3 custom activities: +# CUSTOM_SINGLE_ACTIVITY, CUSTOM_ACTIVITY_START, CUSTOM_ACTIVITY_END +rm -f ./curl.out +data='{"inputs":[{"name":"INPUT0","datatype":"INT32","shape":[1,1],"data":[4]}]}' +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/models/custom_identity_int32/infer -d ${data}` +set -e +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +set +e + + +$TRACE_SUMMARY -t custom_tracing_triton.log > summary_custom_tracing_triton.log + +if [ `grep -c "CUSTOM_SINGLE_ACTIVITY" 
summary_custom_tracing_triton.log` != "1" ]; then + cat summary_custom_tracing_triton.log + echo -e "\n***\n*** Test Failed: Unexpected number of traced "CUSTOM_ACTIVITY" events.\n***" + RET=1 +fi + +if [ `grep -c "CUSTOM_ACTIVITY_START" summary_custom_tracing_triton.log` != "1" ]; then + cat summary_custom_tracing_triton.log + echo -e "\n***\n*** Test Failed: Unexpected number of traced "CUSTOM_ACTIVITY_START" events.\n***" + RET=1 +fi + +if [ `grep -c "CUSTOM_ACTIVITY_END" summary_custom_tracing_triton.log` != "1" ]; then + cat summary_custom_tracing_triton.log + echo -e "\n***\n*** Test Failed: Unexpected number of traced "CUSTOM_ACTIVITY_END" events.\n***" + RET=1 +fi + +# Check opentelemetry trace exporter sends proper info. +# A helper python script starts listening on $OTLP_PORT, where +# OTLP exporter sends traces. +OTLP_PORT=10000 +OTEL_COLLECTOR=./otelcol +OTEL_COLLECTOR_LOG="./trace_collector_http_exporter.log" + +# Installing OpenTelemetry collector (v0.91.0). +# Ref: https://opentelemetry.io/docs/collector/getting-started/#local +curl --proto '=https' --tlsv1.2 -fOL https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.91.0/otelcol_0.91.0_linux_amd64.tar.gz +tar -xvf otelcol_0.91.0_linux_amd64.tar.gz + +rm collected_traces.json* +# Unittests then check that produced spans have expected format and events +OPENTELEMETRY_TEST=opentelemetry_unittest.py +OPENTELEMETRY_LOG="opentelemetry_unittest.log" +EXPECTED_NUM_TESTS="19" + +# Set up repo and args for SageMaker +export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME="simple" +MODEL_PATH="/opt/ml/models/123456789abcdefghi/model" +rm -r ${MODEL_PATH} +mkdir -p "${MODEL_PATH}" +cp -r $DATADIR/$MODELBASE/* ${MODEL_PATH} && \ + rm -r ${MODEL_PATH}/2 && rm -r ${MODEL_PATH}/3 && \ + sed -i "s/onnx_int32_int32_int32/simple/" ${MODEL_PATH}/config.pbtxt + +# Add model to test trace context exposed to python backend +mkdir -p $MODELSDIR/trace_context/1 && cp ./trace_context.py $MODELSDIR/trace_context/1/model.py + +# set up identity model +rm -r ${MODELSDIR}/custom_identity_int32 +mkdir -p $MODELSDIR/custom_identity_int32/1 && (cd $MODELSDIR/custom_identity_int32 && \ + echo 'name: "custom_identity_int32"' >> config.pbtxt && \ + echo 'backend: "identity"' >> config.pbtxt && \ + echo 'max_batch_size: 1024' >> config.pbtxt && \ + echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \ + echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt && \ + echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "500" } }, { key: "enable_custom_tracing" \n value: { string_value: "true" } }, { key: "nested_span_count" \n value: { string_value: "6" } }, { key: "single_activity_frequency" \n value: { string_value: "3" } }]' >> config.pbtxt) + +SERVER_ARGS="--allow-sagemaker=true --model-control-mode=explicit \ + --load-model=simple --load-model=ensemble_add_sub_int32_int32_int32 \ + --load-model=repeat_int32 --load-model=custom_identity_int32\ + --load-model=input_all_required \ + --load-model=dynamic_batch \ + --load-model=bls_simple --trace-config=level=TIMESTAMPS \ + --load-model=trace_context --trace-config=rate=1 \ + --trace-config=count=-1 --trace-config=mode=opentelemetry \ + --trace-config=opentelemetry,resource=test.key=test.value \ + --trace-config=opentelemetry,resource=service.name=test_triton \ + 
--trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \ + --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_otel_otelcol_exporter.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +python $OPENTELEMETRY_TEST >>$OPENTELEMETRY_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $OPENTELEMETRY_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $OPENTELEMETRY_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e +kill $SERVER_PID +wait $SERVER_PID +set +e + +# Testing OTel WAR with trace rate = 0 +rm collected_traces.json + +OTEL_COLLECTOR=./otelcol +OTEL_COLLECTOR_LOG="./trace_collector_exporter.log" +$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$! + +SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=0\ + --trace-config=count=-1 --trace-config=mode=opentelemetry \ + --trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \ + --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_otel_WAR.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +get_trace_setting "bls_simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"0\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"-1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"opentelemetry\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"url\":\"localhost:$OTLP_PORT/v1/traces\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"bsp_max_export_batch_size\":\"512\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"bsp_schedule_delay\":\"5000\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"bsp_max_queue_size\":\"2048\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":" ./curl.out` != "0" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":" ./curl.out` != "0" ]; then + RET=1 +fi + + +set +e +# Send bls requests to make sure bls_simple model is NOT traced +for p in {1..10}; do + python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1 +done + +if [ -s collected_traces.json ] ; then + echo -e "\n***\n*** collected_traces.json should be empty, but it is not.\n***" + exit 1 +fi + +# Send 1 bls request with OTel context to make sure it is traced +python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32", \ + headers={"traceparent": "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"} \ + )' >> client_update.log 2>&1 + +sleep 20 + +if ! [ -s collected_traces.json ] ; then + echo -e "\n***\n*** collected_traces.json should contain OTel trace, but it is not. 
\n***" + exit 1 +fi + +set -e +kill $COLLECTOR_PID +wait $COLLECTOR_PID +kill $SERVER_PID +wait $SERVER_PID +set +e + +# Test that only traces with OTel Context are collected after count goes to 0 +SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=5\ + --trace-config=count=1 --trace-config=mode=opentelemetry \ + --trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \ + --model-repository=$MODELSDIR" +SERVER_LOG="./inference_server_otel_WAR.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + + +rm collected_traces.json +$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$! + +get_trace_setting "bls_simple" +assert_curl_success "Failed to obtain trace settings for 'simple' model" + +if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_rate\":\"5\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_count\":\"1\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_mode\":\"opentelemetry\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"url\":\"localhost:$OTLP_PORT/v1/traces\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"bsp_max_export_batch_size\":\"512\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"bsp_schedule_delay\":\"5000\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"bsp_max_queue_size\":\"2048\"" ./curl.out` != "1" ]; then + RET=1 +fi +if [ `grep -c "\"trace_file\":" ./curl.out` != "0" ]; then + RET=1 +fi +if [ `grep -c "\"log_frequency\":" ./curl.out` != "0" ]; then + RET=1 +fi + +set +e +# Send bls requests to make sure bls_simple model is NOT traced +for p in {1..20}; do + python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1 +done + +sleep 20 + +if ! [[ -s collected_traces.json && `grep -c "\"name\":\"InferRequest\"" ./collected_traces.json` == 1 && `grep -c "\"parentSpanId\":\"\"" ./collected_traces.json` == 1 ]] ; then + echo -e "\n***\n*** collected_traces.json should contain only 1 trace.\n***" + cat collected_traces.json + exit 1 +fi + +# Send 4 bls request with OTel context and 4 without to make sure it is traced +for p in {1..10}; do + python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32", \ + headers={"traceparent": "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"} \ + )' >> client_update.log 2>&1 + + python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32" \ + )' >> client_update.log 2>&1 + + sleep 10 +done + +if ! [[ -s collected_traces.json && `grep -c "\"parentSpanId\":\"\"" ./collected_traces.json` == 1 && `grep -c "\"parentSpanId\":\"b7ad6b7169242424\"" ./collected_traces.json` == 10 ]] ; then + echo -e "\n***\n*** collected_traces.json should contain 11 OTel trace, but it is not. \n***" + exit 1 +fi + +set -e +kill $COLLECTOR_PID +wait $COLLECTOR_PID +kill $SERVER_PID +wait $SERVER_PID +set +e + +################################################################################ +# Tests to make sure BatchSpanProcessor's arguments are propagated from cmd # +# to trace initialization step. 
# +################################################################################ + +# bsp_max_queue_size = 1 +# We are sending a bls request, that results in a trace with 6 spans, +# but because `bsp_max_queue_size` is 1, OTel should drop some of them +# and print a warning in a log. +EXPECTED_WARNING="BatchSpanProcessor queue is full - dropping span." +SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1\ + --trace-config=count=-1 --trace-config=mode=opentelemetry \ + --trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \ + --trace-config opentelemetry,bsp_max_queue_size=1 + --model-repository=$MODELSDIR --log-verbose=1" +SERVER_LOG="./inference_server_otel_BSP_max_queue_size.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm collected_traces.json +$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$! + +set +e +python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1 + +sleep 20 + +if ! [[ `grep -c "$EXPECTED_WARNING" $SERVER_LOG` > 0 ]] ; then + echo -e "\n***\n*** $SERVER_LOG does not contain expected BSP warning.\n***" + cat $SERVER_LOG + exit 1 +fi + +set -e +kill $COLLECTOR_PID +wait $COLLECTOR_PID +kill $SERVER_PID +wait $SERVER_PID +set +e + +# bsp_schedule_delay = 0 +# We are sending a bls request, that results in a trace with 6 spans. +# `bsp_schedule_delay` is 0, so OTel should export traces in batches of random +# size, that translates into random number of 'scopeSpans' field in +# `collected_traces.json`. +SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1\ + --trace-config=count=-1 --trace-config=mode=opentelemetry \ + --trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \ + --trace-config opentelemetry,bsp_schedule_delay=0 + --model-repository=$MODELSDIR --log-verbose=1" +SERVER_LOG="./inference_server_otel_BSP_schedule_delay.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm collected_traces.json +$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$! + +set +e +python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1 + +sleep 10 + +if ! [[ -s collected_traces.json && `grep -o "scopeSpans" ./collected_traces.json | wc -l` > 1 ]] ; then + echo -e "\n***\n*** collected_traces.json has unexpected number of span batches collected.\n***" + cat collected_traces.json + exit 1 +fi + +set -e +kill $COLLECTOR_PID +wait $COLLECTOR_PID +kill $SERVER_PID +wait $SERVER_PID +set +e + +# bsp_max_export_batch_size = 1 +# We are sending a bls request, that results in a trace with 6 spans. +# `bsp_max_export_batch_size` is 1, so OTel should export traces in batches of +# size 1, that translates into 6 entries of 'scopeSpans' field in +# `collected_traces.json`. 
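+# For reference, each export batch written by the collector's file exporter is
+# an OTLP/JSON object roughly of the form below (illustrative sketch only; the
+# exact fields depend on the exporter version):
+#   {"resourceSpans":[{"resource":{...},"scopeSpans":[{"scope":{...},
+#     "spans":[{"name":"...","spanId":"...","parentSpanId":"..."}]}]}]}
+# so counting occurrences of 'scopeSpans' approximates the number of batches.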
+SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1\ + --trace-config=count=-1 --trace-config=mode=opentelemetry \ + --trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \ + --trace-config opentelemetry,bsp_max_export_batch_size=1 + --model-repository=$MODELSDIR --log-verbose=1" +SERVER_LOG="./inference_server_otel_BSP_max_export_batch_size.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm collected_traces.json +$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$! + +set +e +python -c 'import opentelemetry_unittest; \ + opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1 + +sleep 10 + +if ! [[ -s collected_traces.json && `grep -o "scopeSpans" ./collected_traces.json | wc -l` == 6 ]] ; then + echo -e "\n***\n*** collected_traces.json has unexpected number of span batches collected.\n***" + cat collected_traces.json + exit 1 +fi + +set -e +kill $COLLECTOR_PID +wait $COLLECTOR_PID +kill $SERVER_PID +wait $SERVER_PID +set +e + +# Test that PBE returns None as trace context in trace mode Triton +SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1\ + --trace-config=count=-1 --trace-config=mode=triton \ + --model-repository=$MODELSDIR --log-verbose=1" +SERVER_LOG="./inference_server_triton_trace_context.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +check_pbe_trace_context "trace_context" true +assert_curl_success "PBE trace context is not None" + +set -e +kill $SERVER_PID +wait $SERVER_PID +set +e + +# Test that PBE returns None as trace context in trace mode OpenTelemetry, +# but traceing is OFF. +SERVER_ARGS="--trace-config=level=OFF --trace-config=rate=1\ + --trace-config=count=-1 --trace-config=mode=opentelemetry \ + --model-repository=$MODELSDIR --log-verbose=1" +SERVER_LOG="./inference_server_triton_trace_context.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +check_pbe_trace_context "trace_context" true +assert_curl_success "PBE trace context is not None" + +set -e +kill $SERVER_PID +wait $SERVER_PID +set +e +exit $RET diff --git a/qa/L0_trace/trace-config.yaml b/qa/L0_trace/trace-config.yaml new file mode 100644 index 0000000000..2948058adf --- /dev/null +++ b/qa/L0_trace/trace-config.yaml @@ -0,0 +1,51 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Simple config file for OpenTelemetry collector. +# It receives all traces, received on localhost:10000 and prints +# it into the output stream. +# Ref: https://opentelemetry.io/docs/collector/configuration/ +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:10000 + +processors: + batch: + send_batch_size: 10 + timeout: 10s + +exporters: + file: + path: ./collected_traces.json + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [file] diff --git a/qa/L0_trace/trace_context.py b/qa/L0_trace/trace_context.py new file mode 100644 index 0000000000..db2db29ce8 --- /dev/null +++ b/qa/L0_trace/trace_context.py @@ -0,0 +1,72 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
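+
+# This Python backend model ("trace_context") is used by the trace tests: it
+# reads the BOOL input "expect_none", fetches the trace context via
+# request.trace().get_context(), fails if the presence of the context does not
+# match the expectation, and otherwise returns the context serialized as the
+# STRING output "OUTPUT0".
+#
+# A client exercising it could look roughly like the sketch below (client-side
+# only; the endpoint and shapes here are assumptions for illustration):
+#   import numpy as np
+#   import tritonclient.http as httpclient
+#   client = httpclient.InferenceServerClient("localhost:8000")
+#   expect_none = httpclient.InferInput("expect_none", [1], "BOOL")
+#   expect_none.set_data_from_numpy(np.array([True]))
+#   client.infer("trace_context", [expect_none])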
+ +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + inputs = [{"name": "expect_none", "data_type": "TYPE_BOOL", "dims": [1]}] + outputs = [{"name": "OUTPUT0", "data_type": "TYPE_STRING", "dims": [-1]}] + + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) + + for input in inputs: + if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + return auto_complete_model_config + + def execute(self, requests): + responses = [] + for request in requests: + expect_none = pb_utils.get_input_tensor_by_name( + request, "expect_none" + ).as_numpy()[0] + context = request.trace().get_context() + if expect_none and context is not None: + raise pb_utils.TritonModelException("Context should be None") + if not expect_none and context is None: + raise pb_utils.TritonModelException("Context should NOT be None") + + output_tensor = pb_utils.Tensor( + "OUTPUT0", np.array(context).astype(np.bytes_) + ) + inference_response = pb_utils.InferenceResponse([output_tensor]) + responses.append(inference_response) + + return responses diff --git a/qa/L0_trace/trace_endpoint_test.py b/qa/L0_trace/trace_endpoint_test.py new file mode 100755 index 0000000000..f15d2b4e75 --- /dev/null +++ b/qa/L0_trace/trace_endpoint_test.py @@ -0,0 +1,478 @@ +#!/usr/bin/python + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import json +import sys +import unittest + +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from google.protobuf import json_format +from tritonclient.utils import InferenceServerException + + +# Similar set up as dynamic batcher tests +class TraceEndpointTest(tu.TestResultCollector): + def tearDown(self): + # Clear all trace settings to initial state. + # Note that the tearDown function uses HTTP client so the pass/fail + # of the HTTP trace setting test cases should be checked to make sure + # tearDown() is properly executed and not affecting start state of + # other test cases + clear_settings = { + "trace_level": None, + "trace_rate": None, + "trace_count": None, + "log_frequency": None, + } + triton_client = httpclient.InferenceServerClient("localhost:8000") + triton_client.update_trace_settings( + model_name="simple", settings=clear_settings + ) + triton_client.update_trace_settings(model_name=None, settings=clear_settings) + + def check_server_initial_state(self): + # Helper function to make sure the trace setting is properly + # initialized / reset before actually running the test case. + # Note that this function uses HTTP client so the pass/fail of + # the HTTP trace setting test cases should be checked to make sure + # the initial state is checked properly before running other test cases. + initial_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["TIMESTAMPS"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "0", + "trace_mode": "triton", + } + triton_client = httpclient.InferenceServerClient("localhost:8000") + self.assertEqual( + initial_settings, triton_client.get_trace_settings(model_name="simple") + ) + self.assertEqual(initial_settings, triton_client.get_trace_settings()) + + def test_http_get_settings(self): + # Model trace settings will be the same as global trace settings since + # no update has been made. + initial_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["TIMESTAMPS"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "0", + "trace_mode": "triton", + } + triton_client = httpclient.InferenceServerClient("localhost:8000") + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected initial model trace settings", + ) + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(), + "Unexpected initial global settings", + ) + try: + triton_client.get_trace_settings(model_name="does-not-exist") + except Exception as ex: + self.assertIn( + "Request for unknown model : does-not-exist", + ex.message(), + ) + + def test_grpc_get_settings(self): + # Model trace settings will be the same as global trace settings since + # no update has been made. 
+ initial_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["TIMESTAMPS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "trace_mode": {"value": ["triton"]}, + "log_frequency": {"value": ["0"]}, + } + } + ), + initial_settings, + ) + + triton_client = grpcclient.InferenceServerClient("localhost:8001") + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected initial model trace settings", + ) + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(), + "Unexpected initial global settings", + ) + try: + triton_client.get_trace_settings(model_name="does-not-exist") + except Exception as ex: + self.assertIn( + "Request for unknown model : does-not-exist", + ex.message(), + ) + + def test_http_update_settings(self): + # Update model and global trace settings in order, + # and expect the global trace settings will only reflect to + # the model setting fields that haven't been specified. + self.check_server_initial_state() + + expected_first_model_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["TIMESTAMPS"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "0", + "trace_mode": "triton", + } + expected_first_model_response = { + "error": "trace file location can not be updated through network protocol" + } + expected_second_model_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["TIMESTAMPS", "TENSORS"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "0", + "trace_mode": "triton", + } + expected_global_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["TIMESTAMPS", "TENSORS"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "0", + "trace_mode": "triton", + } + + model_update_settings = {"trace_file": "model.log"} + global_update_settings = { + "trace_level": ["TIMESTAMPS", "TENSORS"], + } + + triton_client = httpclient.InferenceServerClient("localhost:8000") + with self.assertRaisesRegex( + InferenceServerException, expected_first_model_response["error"] + ) as e: + triton_client.update_trace_settings( + model_name="simple", settings=model_update_settings + ) + self.assertEqual( + expected_first_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) + # Note that 'trace_level' may be mismatch due to the order of + # the levels listed, currently we assume the order is the same + # for simplicity. But the order shouldn't be enforced and this checking + # needs to be improved when this kind of failure is reported + self.assertEqual( + expected_global_settings, + triton_client.update_trace_settings(settings=global_update_settings), + "Unexpected updated global settings", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) + try: + triton_client.update_trace_settings( + model_name="does-not-exist", settings=model_update_settings + ) + except Exception as ex: + self.assertIn( + "Request for unknown model : does-not-exist", + ex.message(), + ) + + def test_grpc_update_settings(self): + # Update model and global trace settings in order, + # and expect the global trace settings will only reflect to + # the model setting fields that haven't been specified. 
+ self.check_server_initial_state() + + expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["TIMESTAMPS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + "trace_mode": {"value": ["triton"]}, + } + } + ), + expected_first_model_settings, + ) + + expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["TIMESTAMPS", "TENSORS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + "trace_mode": {"value": ["triton"]}, + } + } + ), + expected_second_model_settings, + ) + + expected_global_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["TIMESTAMPS", "TENSORS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + "trace_mode": {"value": ["triton"]}, + } + } + ), + expected_global_settings, + ) + + model_update_settings = {"trace_file": "model.log"} + global_update_settings = { + "trace_level": ["TIMESTAMPS", "TENSORS"], + } + + triton_client = grpcclient.InferenceServerClient("localhost:8001") + # Note that 'trace_level' may be mismatch due to the order of + # the levels listed, currently we assume the order is the same + # for simplicity. But the order shouldn't be enforced and this checking + # needs to be improved when this kind of failure is reported + self.assertEqual( + expected_global_settings, + triton_client.update_trace_settings(settings=global_update_settings), + "Unexpected updated global settings", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) + try: + triton_client.update_trace_settings( + model_name="does-not-exist", settings=model_update_settings + ) + except Exception as ex: + self.assertIn( + "Request for unknown model : does-not-exist", + ex.message(), + ) + + def test_http_clear_settings(self): + # Clear global and model trace settings in order, + # and expect the default / global trace settings are + # propagated properly. 
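+        # Note: "clearing" a field means sending None for it; a cleared
+        # model-level field falls back to the current global value, and a
+        # cleared global field falls back to the command line default.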
+ self.check_server_initial_state() + + # First set up the model / global trace setting that: + # model 'simple' has 'trace_rate' and 'log_frequency' specified + # global has 'trace_level', 'trace_count' and 'trace_rate' specified + triton_client = httpclient.InferenceServerClient("localhost:8000") + triton_client.update_trace_settings( + model_name="simple", settings={"trace_rate": "12", "log_frequency": "34"} + ) + triton_client.update_trace_settings( + settings={"trace_rate": "56", "trace_count": "78", "trace_level": ["OFF"]} + ) + + expected_global_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["OFF"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "0", + "trace_mode": "triton", + } + expected_first_model_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["OFF"], + "trace_rate": "12", + "trace_count": "-1", + "log_frequency": "34", + "trace_mode": "triton", + } + expected_second_model_settings = { + "trace_file": "global_unittest.log", + "trace_level": ["OFF"], + "trace_rate": "1", + "trace_count": "-1", + "log_frequency": "34", + "trace_mode": "triton", + } + global_clear_settings = {"trace_rate": None, "trace_count": None} + model_clear_settings = {"trace_rate": None, "trace_level": None} + + # Clear global + self.assertEqual( + expected_global_settings, + triton_client.update_trace_settings(settings=global_clear_settings), + "Unexpected cleared global trace settings", + ) + self.assertEqual( + expected_first_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global clear", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.update_trace_settings( + model_name="simple", settings=model_clear_settings + ), + "Unexpected model trace settings after model clear", + ) + self.assertEqual( + expected_global_settings, + triton_client.get_trace_settings(), + "Unexpected global trace settings after model clear", + ) + + def test_grpc_clear_settings(self): + # Clear global and model trace settings in order, + # and expect the default / global trace settings are + # propagated properly. 
+ self.check_server_initial_state() + + # First set up the model / global trace setting that: + # model 'simple' has 'trace_rate' and 'log_frequency' specified + # global has 'trace_level', 'trace_count' and 'trace_rate' specified + triton_client = grpcclient.InferenceServerClient("localhost:8001") + triton_client.update_trace_settings( + model_name="simple", settings={"trace_rate": "12", "log_frequency": "34"} + ) + triton_client.update_trace_settings( + settings={"trace_rate": "56", "trace_count": "78", "trace_level": ["OFF"]} + ) + + expected_global_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_mode": {"value": ["triton"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } + } + ), + expected_global_settings, + ) + expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["12"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["34"]}, + "trace_mode": {"value": ["triton"]}, + } + } + ), + expected_first_model_settings, + ) + expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse() + json_format.Parse( + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["34"]}, + "trace_mode": {"value": ["triton"]}, + } + } + ), + expected_second_model_settings, + ) + + global_clear_settings = {"trace_rate": None, "trace_count": None} + model_clear_settings = {"trace_rate": None, "trace_level": None} + + # Clear global + self.assertEqual( + expected_global_settings, + triton_client.update_trace_settings(settings=global_clear_settings), + "Unexpected cleared global trace settings", + ) + self.assertEqual( + expected_first_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global clear", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.update_trace_settings( + model_name="simple", settings=model_clear_settings + ), + "Unexpected model trace settings after model clear", + ) + self.assertEqual( + expected_global_settings, + triton_client.get_trace_settings(), + "Unexpected global trace settings after model clear", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_triton_repo_agent/models/chain_relocation/config.pbtxt b/qa/L0_triton_repo_agent/models/chain_relocation/config.pbtxt new file mode 100644 index 0000000000..8c8cc8287c --- /dev/null +++ b/qa/L0_triton_repo_agent/models/chain_relocation/config.pbtxt @@ -0,0 +1,49 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +model_repository_agents +{ + agents [ + { + name: "relocation", + parameters [ + { + key: "empty_config", + value: "false" + } + ] + }, + { + name: "relocation", + parameters [ + { + key: "empty_config", + value: "true" + } + ] + } + ] +} \ No newline at end of file diff --git a/qa/L0_triton_repo_agent/models/relocation_sanity_check/config.pbtxt b/qa/L0_triton_repo_agent/models/relocation_sanity_check/config.pbtxt new file mode 100644 index 0000000000..9478be13cb --- /dev/null +++ b/qa/L0_triton_repo_agent/models/relocation_sanity_check/config.pbtxt @@ -0,0 +1,40 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +model_repository_agents +{ + agents [ + { + name: "relocation", + parameters [ + { + key: "empty_config", + value: "true" + } + ] + } + ] +} \ No newline at end of file diff --git a/qa/L0_triton_repo_agent/test.sh b/qa/L0_triton_repo_agent/test.sh new file mode 100755 index 0000000000..7dbc559891 --- /dev/null +++ b/qa/L0_triton_repo_agent/test.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +source ../common/util.sh + +RET=0 + +TEST_LOG="./triton_repo_agent_test.log" +TRITON_REPO_AGENT_TEST=./repo_agent_test + + +export CUDA_VISIBLE_DEVICES=0 + +rm -fr *.log + +set +e +$TRITON_REPO_AGENT_TEST >>$TEST_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Repo Agent Unit Test Failed\n***" + RET=1 +fi +set -e + +rm -rf /opt/tritonserver/repoagents/relocation +mkdir -p /opt/tritonserver/repoagents/relocation && + cp libtritonrepoagent_relocation.so /opt/tritonserver/repoagents/relocation/. + +SERVER=/opt/tritonserver/bin/tritonserver + +SERVER_ARGS="--model-repository=`pwd`/models" +SERVER_LOG="./inference_server.log" +run_server +if [ "$SERVER_PID" != "0" ]; then + kill $SERVER_PID + wait $SERVER_PID + + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +grep "Poll failed for model directory 'relocation_sanity_check': Relocation repoagent expects config does not contain 'model_repository_agents' field when 'empty_config' has value 'true' for relocation agent" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected repo agent of 'relocation_sanity_check' returns error on load\n***" + RET=1 +fi +grep "Poll failed for model directory 'chain_relocation': Relocation repoagent" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. 
Expected repo agent of 'chain_relocation' returns success on load\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $TEST_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_trt_bf16_dtype/test.sh b/qa/L0_trt_bf16_dtype/test.sh new file mode 100755 index 0000000000..da787bc41a --- /dev/null +++ b/qa/L0_trt_bf16_dtype/test.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +source ../common/util.sh + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +RET=0 +TRT_TEST="trt_bf16_dtype_test.py" +TEST_RESULT_FILE="./test_results.txt" +SERVER=/opt/tritonserver/bin/tritonserver + +rm -rf ./fixed_models/ ./dynamic_models/ *.log* && mkdir ./fixed_models/ ./dynamic_models/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/plan_*bf16_bf16_bf16 ./fixed_models/ +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_*bf16_bf16_bf16 ./dynamic_models/ + +for TEST in "fixed" "dynamic"; do + MODELDIR="./${TEST}_models" + CLIENT_LOG="./${TEST}_client.log" + SERVER_LOG="./${TEST}_inference_server.log" + SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1" + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + python3 $TRT_TEST TrtBF16DataTypeTest.test_${TEST} >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Running $TRT_TEST TrtBF16DataTypeTest.test_${TEST} Failed\n***" + cat $CLIENT_LOG + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_trt_bf16_dtype/trt_bf16_dtype_test.py b/qa/L0_trt_bf16_dtype/trt_bf16_dtype_test.py new file mode 100755 index 0000000000..265c1930b0 --- /dev/null +++ b/qa/L0_trt_bf16_dtype/trt_bf16_dtype_test.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.http as client + + +class TrtBF16DataTypeTest(tu.TestResultCollector): + def setUp(self): + self.triton_client = client.InferenceServerClient( + "localhost:8000", verbose=True + ) + + def _infer_helper(self, model_name, shape): + inputs = [] + outputs = [] + inputs.append(client.InferInput("INPUT0", shape, "BF16")) + inputs.append(client.InferInput("INPUT1", shape, "BF16")) + + input0_data = np.ones(shape=shape).astype(np.float32) + input1_data = np.ones(shape=shape).astype(np.float32) + + inputs[0].set_data_from_numpy(input0_data, binary_data=True) + inputs[1].set_data_from_numpy(input1_data, binary_data=True) + + outputs.append(client.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(client.InferRequestedOutput("OUTPUT1", binary_data=True)) + + results = self.triton_client.infer(model_name, inputs, outputs=outputs) + + output0_data = results.as_numpy("OUTPUT0") + output1_data = results.as_numpy("OUTPUT1") + + np.testing.assert_equal( + output0_data, + input0_data + input1_data, + "Result output does not match the expected output", + ) + np.testing.assert_equal( + output1_data, + input0_data - input1_data, + "Result output does not match the expected output", + ) + + def test_fixed(self): + for bs in [1, 4, 8]: + self._infer_helper( + "plan_bf16_bf16_bf16", + [bs, 16], + ) + + self._infer_helper( + "plan_nobatch_bf16_bf16_bf16", + [16], + ) + + def test_dynamic(self): + for bs in [1, 4, 8]: + self._infer_helper( + "plan_bf16_bf16_bf16", + [bs, 16, 16], + ) + + self._infer_helper( + "plan_nobatch_bf16_bf16_bf16", + [16, 16], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_compat/test.sh b/qa/L0_trt_compat/test.sh new file mode 100755 index 0000000000..a8161369df --- /dev/null +++ b/qa/L0_trt_compat/test.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
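+
+# This test exercises TensorRT version-compatible (lean runtime) engines.
+# The first server start leaves the TensorRT backend's version-compatible
+# option disabled and is expected to fail with "Cannot deserialize engine with
+# lean runtime"; the second start adds
+# --backend-config=tensorrt,version-compatible=true and runs
+# trt_compatibility_test.py against the loaded model.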
+ +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi + +TEST_RESULT_FILE='test_results.txt' +COMPATIBILITY_TEST_PY=trt_compatibility_test.py +CLIENT_LOG="client.log" +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --exit-timeout-secs=120" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -fr models && mkdir models +cp -r $DATADIR/qa_identity_model_repository/plan_compatible_zero_1_float32 models/. + +RET=0 + +if [ `ps | grep -c "tritonserver"` != "0" ]; then + echo -e "Tritonserver already running" + echo -e `ps | grep tritonserver` + exit 1 +fi + +run_server +if [ "$SERVER_PID" != "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** FAILED: unexpected server start (version compatibility disabled): $SERVER\n***" >> $CLIENT_LOG + kill $SERVER_PID + wait $SERVER_PID + exit 1 +fi + +EXPECTED_ERR="Cannot deserialize engine with lean runtime" +if ! grep "$EXPECTED_ERR" $SERVER_LOG; then + cat $SERVER_LOG + echo -e "\n***\n*** Failed to find expected error: ${EXPECTED_ERR} \n***" + RET=1 +fi + +SERVER_ARGS="--model-repository=`pwd`/models --exit-timeout-secs=120 --backend-config=tensorrt,version-compatible=true" + +run_server +if [ "$SERVER_PID" == "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** FAILED: unsuccessful server start (version compatibility enabled): $SERVER\n***" + exit 1 +fi + +set +e + +python $COMPATIBILITY_TEST_PY >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_trt_compat/trt_compatibility_test.py b/qa/L0_trt_compat/trt_compatibility_test.py new file mode 100755 index 0000000000..6991299a4c --- /dev/null +++ b/qa/L0_trt_compat/trt_compatibility_test.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu + + +class TrtCompatibilityTest(tu.TestResultCollector): + def setUp(self): + self._data_type = np.float32 + + def test_plan(self): + # plan_compatible_zero_1_float32 is an identity model with input shape [-1] + iu.infer_zero(self, "plan_compatible", 1, self._data_type, [[2, 4]], [[2, 4]]) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_data_dependent_shape/test.sh b/qa/L0_trt_data_dependent_shape/test.sh new file mode 100755 index 0000000000..61efb053f8 --- /dev/null +++ b/qa/L0_trt_data_dependent_shape/test.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! 
-z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +export CUDA_VISIBLE_DEVICES=0 + +TRT_TEST=trt_data_dependent_shape_test.py + +DATADIR="./models" + +rm -rf ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_trt_data_dependent_model_repository/ ${DATADIR} + +source ../common/util.sh + +rm -f *.log* + +RET=0 + +CLIENT_LOG="./client.log" +SERVER_LOG="./inference_server.log" +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_TEST >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py b/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py new file mode 100755 index 0000000000..ee0b675d84 --- /dev/null +++ b/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
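+
+# Note on the expected output layout (a minimal sketch, not executed by the
+# tests): np.nonzero() returns a tuple of per-dimension index arrays, and
+# np.array_equal() coerces that tuple into an array of shape
+# (rank, num_nonzero). The nonzero plan models are therefore assumed to emit
+# their indices in that stacked layout, e.g.:
+#
+#   import numpy as np
+#   x = np.array([[1, 0], [0, 2]], dtype=np.int32)
+#   np.nonzero(x)              # (array([0, 1]), array([0, 1]))
+#   np.asarray(np.nonzero(x))  # array([[0, 1],
+#                              #        [0, 1]])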
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.http as client + + +class TrtDataDependentShapeTest(tu.TestResultCollector): + def setUp(self): + self.triton_client = client.InferenceServerClient( + "localhost:8000", verbose=True + ) + + def test_fixed(self): + model_name = "plan_nobatch_nonzero_fixed" + input_np = np.arange(16, dtype=np.int32).reshape((4, 4)) + expected_output_np = np.nonzero(input_np) + + inputs = [] + inputs.append(client.InferInput("INPUT", [4, 4], "INT32")) + inputs[-1].set_data_from_numpy(input_np) + + results = self.triton_client.infer(model_name=model_name, inputs=inputs) + # Validate the results by comparing with precomputed values. + output_np = results.as_numpy("OUTPUT") + self.assertTrue( + np.array_equal(output_np, expected_output_np), + "OUTPUT expected: {}, got {}".format(expected_output_np, output_np), + ) + + def test_dynamic(self): + model_name = "plan_nobatch_nonzero_dynamic" + input_data = [] + for i in range(20 * 16): + input_data.append(i if (i % 2) == 0 else 0) + input_np = np.array(input_data, dtype=np.int32).reshape((20, 16)) + expected_output_np = np.nonzero(input_np) + + inputs = [] + inputs.append(client.InferInput("INPUT", [20, 16], "INT32")) + inputs[-1].set_data_from_numpy(input_np) + + results = self.triton_client.infer(model_name=model_name, inputs=inputs) + # Validate the results by comparing with precomputed values. + output_np = results.as_numpy("OUTPUT") + self.assertTrue( + np.array_equal(output_np, expected_output_np), + "OUTPUT expected: {}, got {}".format(expected_output_np, output_np), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_dla/dla_test.py b/qa/L0_trt_dla/dla_test.py new file mode 100755 index 0000000000..d71d277ac4 --- /dev/null +++ b/qa/L0_trt_dla/dla_test.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
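+
+# Preprocessing sketch (comments only, mirrors _preprocess() below): the test
+# image is resized to 224x224, per-channel offsets (123, 117, 104), which
+# appear to be approximate ImageNet channel means, are subtracted, and the
+# result is transposed from HWC to CHW. With INT8 data and batch size 32 this
+# produces an input of shape (32, 3, 224, 224) for the resnet50_plan model:
+#
+#   img = Image.open("../images/vulture.jpeg").convert("RGB")
+#   hwc = np.array(img.resize((224, 224), Image.BILINEAR)).astype(np.int8)
+#   chw = np.transpose(hwc - np.asarray((123, 117, 104), dtype=np.int8), (2, 0, 1))
+#   batch = np.repeat(chw[None, ...], 32, axis=0)   # shape (32, 3, 224, 224)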
+ +import sys + +sys.path.append("../common") + +import unittest + +import numpy as np +import test_util as tu +import tritonclient.http as httpclient +from PIL import Image + + +class InferTest(tu.TestResultCollector): + def _preprocess(self, img, dtype): + """ + Pre-process an image to meet the size and type + requirements specified by the parameters. + """ + + sample_img = img.convert("RGB") + resized_img = sample_img.resize((224, 224), Image.BILINEAR) + resized = np.array(resized_img) + + typed = resized.astype(dtype) + scaled = typed - np.asarray((123, 117, 104), dtype=dtype) + ordered = np.transpose(scaled, (2, 0, 1)) + + return ordered + + def test_resnet50(self): + try: + triton_client = httpclient.InferenceServerClient(url="localhost:8000") + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + + image_filename = "../images/vulture.jpeg" + model_name = "resnet50_plan" + batch_size = 32 + + img = Image.open(image_filename) + image_data = self._preprocess(img, np.int8) + image_data = np.expand_dims(image_data, axis=0) + + batched_image_data = image_data + for i in range(1, batch_size): + batched_image_data = np.concatenate( + (batched_image_data, image_data), axis=0 + ) + + inputs = [ + httpclient.InferInput("input_tensor_0", [batch_size, 3, 224, 224], "INT8") + ] + inputs[0].set_data_from_numpy(batched_image_data, binary_data=True) + + outputs = [ + httpclient.InferRequestedOutput("topk_layer_output_index", binary_data=True) + ] + + results = triton_client.infer(model_name, inputs, outputs=outputs) + + output_data = results.as_numpy("topk_layer_output_index") + print(output_data) + + # Validate the results by comparing with precomputed values. + # VULTURE class corresponds with index 23 + EXPECTED_CLASS_INDEX = 23 + for i in range(batch_size): + self.assertEqual(output_data[i][0][0], EXPECTED_CLASS_INDEX) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_dla/test.sh b/qa/L0_trt_dla/test.sh new file mode 100755 index 0000000000..5c57d447c4 --- /dev/null +++ b/qa/L0_trt_dla/test.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +# Need to run on only one device since only creating a single +# PLAN. Without this test will fail on a heterogeneous system. +export CUDA_VISIBLE_DEVICES=0 + +# Only need to set paths for jetson since this test runs only on jetson +TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} +DLA_TEST=./dla_test.py + +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +SERVER=${TRITON_DIR}/bin/tritonserver +BACKEND_DIR=${TRITON_DIR}/backends + +SERVER_ARGS="--model-repository=`pwd`/models --exit-timeout-secs=120 --backend-directory=${BACKEND_DIR}" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -fr models && mkdir models +cp -r $DATADIR/trt_dla_model_store/resnet50_plan models/. +rm -f *.log + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +RET=0 +CLIENT_LOG=client.log + +set +e + +python3 $DLA_TEST >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +rm -rf models + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_trt_dynamic_shape/test.sh b/qa/L0_trt_dynamic_shape/test.sh new file mode 100755 index 0000000000..43a39dd199 --- /dev/null +++ b/qa/L0_trt_dynamic_shape/test.sh @@ -0,0 +1,407 @@ +#!/bin/bash +# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +PERF_CLIENT=../clients/perf_client +TRT_OP_TEST=trt_dynamic_shape_test.py + +DATADIR="./models" + +rm -rf ${DATADIR} +mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32-4-32 ${DATADIR}/ + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -f *.log* + +RET=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Shape beyond the limits of optimization profile +set +e +$PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32-4-32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 1 > ${CLIENT_LOG}_max 2>&1 +if [ $? -eq 0 ]; then + cat ${CLIENT_LOG}_max + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +EXPECTED_MESSAGE="model expected the shape of dimension 1 to be between 4 and 32 but received" +if [ $(cat ${CLIENT_LOG}_max | grep "${EXPECTED_MESSAGE} 33" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}_max + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32-4-32 --shape INPUT0:3 --shape INPUT1:3 -t 1 -p2000 -b 1 > ${CLIENT_LOG}_min 2>&1 +if [ $? -eq 0 ]; then + cat ${CLIENT_LOG}_min + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat ${CLIENT_LOG}_min | grep "${EXPECTED_MESSAGE} 3" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}_min + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Tests with multiple optimization profiles + +# plan_float32_float32_float32 models with dynamic shapes has 9 profiles +# min, opt, max, idx +# [1, 1], [1, 16], [8, 33], 0 (*) +# [1, 1], [2, 16], [7, 32], 1 +# [1, 1], [3, 16], [6, 32], 2 +# [1, 1], [4, 16], [5, 32], 3 +# [5, 1], [6, 16], [8, 32], 4 (*) +# [6, 1], [6, 16], [8, 32], 5 (*) +# [1, 1], [1, 16], [8, 32], 6 +# [1, 33], [1, 33], [1, 33], 7 (static shapes) +# [3, 33], [3, 33], [3, 33], 8 (static shapes) +# [5, 33], [5, 33], [5, 33], 9 (static shapes) +rm -rf ${DATADIR} && rm -f config.pbtxt && mkdir -p ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32 ${DATADIR}/ + +# Keep a copy of original model config for different modifications +cp -r /data/inferenceserver/${REPO_VERSION}/qa_variable_model_repository/plan_float32_float32_float32/config.pbtxt . 
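+
+# Each subtest below restores the pristine config.pbtxt copied above and then
+# rewrites its "profile:" setting with sed (the copied config is assumed to
+# contain a "profile:" line) to select which optimization profile(s) the
+# TensorRT backend loads, e.g. profile: ["5"] to load only profile 5. The
+# server is restarted for every variant and the matching TrtDynamicShapeTest
+# case is run against it.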
+ +# TrtDynamicShapeTest.test_load_specific_optimization_profile +CLIENT_LOG="./test_load_specific_optimization_profile.client.log" +SERVER_LOG="./test_load_specific_optimization_profile.inference_server.log" +cp config.pbtxt ${DATADIR}/plan_float32_float32_float32/config.pbtxt && \ +sed -i "s/profile:.*/profile: [\"5\"]/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_OP_TEST TrtDynamicShapeTest.test_load_specific_optimization_profile >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtDynamicShapeTest.test_load_default_optimization_profile +CLIENT_LOG="./test_load_default_optimization_profile.client.log" +SERVER_LOG="./test_load_default_optimization_profile.inference_server.log" +cp config.pbtxt ${DATADIR}/plan_float32_float32_float32/config.pbtxt && \ +sed -i "s/profile:.*//" ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_OP_TEST TrtDynamicShapeTest.test_load_default_optimization_profile >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtDynamicShapeTest.test_select_optimization_profile +# Note that this test needs to check server log for which OP is used +# +# finding OP that best fit the input shape: +# load OP 0, 1, 2, 3, send [4 16] and 3 should be used +SERVER_ARGS="--model-repository=$DATADIR --log-verbose=1" +CLIENT_LOG="./test_select_optimization_profile.client.best.log" +SERVER_LOG="./test_select_optimization_profile.inference_server.best.log" +(cp config.pbtxt ${DATADIR}/plan_float32_float32_float32/config.pbtxt && \ + sed -i "s/max_batch_size:.*/max_batch_size: 5/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt && \ + sed -i "s/profile:.*/profile: [\"0\", \"1\", \"2\", \"3\"]/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt) + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_OP_TEST TrtDynamicShapeTest.test_select_optimization_profile >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +grep "Context with profile 3 \[3\] is being executed for " test_select_optimization_profile.inference_server.best.log +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected profile 3 is used\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# finding OP that best fit the input shape while the input shape is allowed: +# load OP 0, 5, send [4 16] and 0 should be used +# (OP 5 is the best in terms of OPT dims, but it requires min dims [6, 1]) +CLIENT_LOG="./test_select_optimization_profile.client.allow.log" +SERVER_LOG="./test_select_optimization_profile.inference_server.allow.log" +cp config.pbtxt ${DATADIR}/plan_float32_float32_float32/config.pbtxt && \ +sed -i "s/profile:.*/profile: [\"0\", \"5\"]/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_OP_TEST TrtDynamicShapeTest.test_select_optimization_profile >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +set +e +grep "Context with profile 0 \[0\] is being executed for " test_select_optimization_profile.inference_server.allow.log +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected profile 0 is used\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# TrtDynamicShapeTest.test_load_wrong_optimization_profile +SERVER_ARGS="--model-repository=$DATADIR --exit-on-error=false --strict-readiness=false" +CLIENT_LOG="./test_load_wrong_optimization_profile.client.log" +SERVER_LOG="./test_load_wrong_optimization_profile.inference_server.log" +cp config.pbtxt ${DATADIR}/plan_float32_float32_float32/config.pbtxt && \ +sed -i "s/profile:.*/profile: [\"100\"]/" ${DATADIR}/plan_float32_float32_float32/config.pbtxt + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_OP_TEST TrtDynamicShapeTest.test_load_wrong_optimization_profile >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + + +# Adding test cases for multiple optimization profiles with static shapes. +# Will load only the following profiles with the static shapes: +# Profile 7: [1, 33] +# Profile 8: [3, 33] +# Profile 9: [5, 33] +(cd ${DATADIR}/plan_float32_float32_float32/ && \ + rm -f config.pbtxt && \ + echo "instance_group { profile : [\"7\", \"8\", \"9\" ] }" >> config.pbtxt) +SERVER_ARGS="--model-repository=$DATADIR --strict-model-config=false" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +# Shape beyond the limits of optimization profile +set +e +$PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 5 > ${CLIENT_LOG}_static_pass 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}_static_pass + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 6 > ${CLIENT_LOG}_static_fail 2>&1 +if [ $? 
-eq 0 ]; then + ${CLIENT_LOG}_static_fail + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat ${CLIENT_LOG}_static_fail | grep "inference request batch-size must be <= 5" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}_static_fail + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +$PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 2 > ${CLIENT_LOG}_static_bs_2 2>&1 +if [ $? -eq 0 ]; then + ${CLIENT_LOG}_static_bs_2 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +if [ $(cat ${CLIENT_LOG}_static_bs_2 | grep "model expected the shape of dimension 0 to be between 1 and 1 but received 2" | wc -l) -eq 0 ]; then + cat ${CLIENT_LOG}_static_bs_2 + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Tests for multiple optimization profile with static shapes and dynamic batching. +# Profile 10: [1, 1], [1, 16], [1, 33] +# Profile 11: [2, 1], [2, 16], [2, 33] +# Profile 12: [3, 1], [3, 16], [3, 33] +# Profile 13: [4, 1], [4, 16], [4, 33] +# Profile 14: [5, 1], [5, 16], [5, 33] +# Profile 15: [6, 1], [6, 16], [6, 33] +# Profile 16: [7, 1], [7, 16], [7, 33] +# Profile 17: [8, 1], [8, 16], [8, 33] + +(cd ${DATADIR}/plan_float32_float32_float32/ && \ + rm -f config.pbtxt && \ + echo "instance_group { profile : [" >> config.pbtxt && \ + for i in {10..16}; do echo "\"${i}\"," >> config.pbtxt; done && \ + echo " \"17\"] }" >> config.pbtxt && \ + echo "dynamic_batching {}" >> config.pbtxt) + +SERVER_ARGS="--model-repository=$DATADIR --strict-model-config=false" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +$PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32 --shape INPUT0:33 --shape INPUT1:33 -t 16 -p2000 > ${CLIENT_LOG}_db_pass 2>&1 +if [ $? -ne 0 ]; then + cat ${CLIENT_LOG}_db_pass + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py b/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py new file mode 100755 index 0000000000..d9f890d9b6 --- /dev/null +++ b/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest + +import infer_util as iu +import numpy as np +import test_util as tu +import tritonhttpclient +from tritonclientutils import InferenceServerException + + +class TrtDynamicShapeTest(tu.TestResultCollector): + def setUp(self): + self.dtype_ = np.float32 + self.model_name_ = "plan" + + def test_load_specific_optimization_profile(self): + # Only OP 5 should be available, which only allow batch size 8 + tensor_shape = (1,) + try: + iu.infer_exact( + self, + self.model_name_, + (1,) + tensor_shape, + 1, + self.dtype_, + self.dtype_, + self.dtype_, + ) + except InferenceServerException as ex: + self.assertTrue( + "model expected the shape of dimension 0 to be between 6 and 8 but received 1" + in ex.message() + ) + + try: + iu.infer_exact( + self, + self.model_name_, + (8,) + tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_load_default_optimization_profile(self): + # Only default OP (OP 0) has max tensor shape 33 + tensor_shape = (33,) + + try: + iu.infer_exact( + self, + self.model_name_, + (8,) + tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + over_tensor_shape = (34,) + try: + iu.infer_exact( + self, + self.model_name_, + (8,) + over_tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) + except InferenceServerException as ex: + self.assertTrue( + "model expected the shape of dimension 1 to be between 1 and 33 but received 34" + in ex.message() + ) + + def test_select_optimization_profile(self): + # Different profile has different optimized input shape + batch_size = 4 + tensor_shape = (16,) + try: + iu.infer_exact( + self, + self.model_name_, + (batch_size,) + tensor_shape, + batch_size, + self.dtype_, + self.dtype_, + self.dtype_, + ) + except InferenceServerException as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_load_wrong_optimization_profile(self): + client = tritonhttpclient.InferenceServerClient("localhost:8000") + model_name = tu.get_model_name( + self.model_name_, self.dtype_, self.dtype_, self.dtype_ + ) + model_status = client.is_model_ready(model_name, "1") + self.assertFalse(model_status, "expected model to be not ready") + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_error_propagation/test.sh b/qa/L0_trt_error_propagation/test.sh new file mode 100755 index 0000000000..dac3f6349e --- /dev/null +++ b/qa/L0_trt_error_propagation/test.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +export CUDA_VISIBLE_DEVICES=0 +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +# Create TensorRT model with invalid plan file +rm -rf models && mkdir models +mkdir models/invalid_plan_file && (cd models/invalid_plan_file && \ + echo -e "name: \"invalid_plan_file\"" >> config.pbtxt && \ + echo -e "platform: \"tensorrt_plan\"" >> config.pbtxt && \ + echo -e "input [\n {\n name: \"INPUT\"\n data_type: TYPE_FP32\n dims: [-1]\n }\n ]" >> config.pbtxt && \ + echo -e "output [\n {\n name: \"OUTPUT\"\n data_type: TYPE_FP32\n dims: [-1]\n }\n ]" >> config.pbtxt && \ + mkdir 1 && echo "----- invalid model.plan -----" >> 1/model.plan) + +# Test with and without auto complete enabled +for ENABLE_AUTOCOMPLETE in "YES" "NO"; do + + if [[ "$ENABLE_AUTOCOMPLETE" == "YES" ]]; then + TEST_NAME="test_invalid_trt_model_autocomplete" + SERVER_ARGS="--model-repository=models --model-control-mode=explicit" + else + TEST_NAME="test_invalid_trt_model" + SERVER_ARGS="--model-repository=models --model-control-mode=explicit --disable-auto-complete-config" + fi + + SERVER_LOG="./$TEST_NAME.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + RET=0 + + set +e + python trt_error_propagation_test.py TestTrtErrorPropagation.$TEST_NAME > $TEST_NAME.log 2>&1 + if [ $? -ne 0 ]; then + cat $TEST_NAME.log + echo -e "\n***\n*** Test FAILED\n***" + RET=1 + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + + if [ $RET -ne 0 ]; then + exit $RET + fi + +done + +# Exit with success +echo -e "\n***\n*** Test Passed\n***" +exit 0 diff --git a/qa/L0_trt_error_propagation/trt_error_propagation_test.py b/qa/L0_trt_error_propagation/trt_error_propagation_test.py new file mode 100755 index 0000000000..83527a7533 --- /dev/null +++ b/qa/L0_trt_error_propagation/trt_error_propagation_test.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import unittest + +import tritonclient.grpc as grpcclient +from tritonclient.utils import InferenceServerException + + +class TestTrtErrorPropagation(unittest.TestCase): + def setUp(self): + # Initialize client + self.__triton = grpcclient.InferenceServerClient("localhost:8001", verbose=True) + + def test_invalid_trt_model(self): + with self.assertRaises(InferenceServerException) as cm: + self.__triton.load_model("invalid_plan_file") + err_msg = str(cm.exception) + # All 'expected_msg_parts' should be present in the 'err_msg' in order + expected_msg_parts = [ + "load failed for model", + "version 1 is at UNAVAILABLE state: ", + "Internal: unable to create TensorRT engine: ", + "Error Code ", + "Internal Error ", + ] + for expected_msg_part in expected_msg_parts: + self.assertIn( + expected_msg_part, + err_msg, + "Cannot find an expected part of error message", + ) + _, err_msg = err_msg.split(expected_msg_part) + + def test_invalid_trt_model_autocomplete(self): + with self.assertRaises(InferenceServerException) as cm: + self.__triton.load_model("invalid_plan_file") + err_msg = str(cm.exception) + self.assertIn( + "Internal: unable to load plan file to auto complete config", + err_msg, + "Caught an unexpected exception", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh new file mode 100755 index 0000000000..a9d04331f0 --- /dev/null +++ b/qa/L0_trt_plugin/test.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +PLUGIN_TEST=trt_plugin_test.py + +# On windows the paths invoked by the script (running in WSL) must use +# /mnt/c when needed but the paths on the tritonserver command-line +# must be C:/ style. +if [[ -v WSL_DISTRO_NAME ]] || [[ -v MSYSTEM ]]; then + DATADIR=${DATADIR:="/mnt/c/data/inferenceserver/${REPO_VERSION}"} + MODELDIR=${MODELDIR:=C:/models} + CUSTOMPLUGIN=${CUSTOMPLUGIN:=$MODELDIR/HardmaxPlugin.dll} + BACKEND_DIR=${BACKEND_DIR:=C:/tritonserver/backends} + SERVER=${SERVER:=/mnt/c/tritonserver/bin/tritonserver.exe} +else + DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + MODELDIR=${MODELDIR:=`pwd`/models} + CUSTOMPLUGIN=${CUSTOMPLUGIN:=$MODELDIR/libcustomHardmaxPlugin.so} + TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"} + BACKEND_DIR=${TRITON_DIR}/backends + SERVER=${TRITON_DIR}/bin/tritonserver +fi + +source ../common/util.sh + +RET=0 +rm -f ./*.log + +SERVER_ARGS_BASE="--model-repository=${MODELDIR} --backend-directory=${BACKEND_DIR} --log-verbose=1" +SERVER_TIMEOUT=20 + +LOG_IDX=0 + +## Custom Plugin Tests + +## Create model folder with custom plugin models +rm -fr models && mkdir -p models +find $DATADIR/qa_trt_plugin_model_repository/ -maxdepth 1 -iname '*Hardmax*' -exec cp -r {} models \; + +LOG_IDX=$((LOG_IDX+1)) + +## Baseline Failure Test +## Plugin library not loaded +SERVER_ARGS=$SERVER_ARGS_BASE +SERVER_LOG="./inference_server_$LOG_IDX.log" + +run_server +if [ "$SERVER_PID" != "0" ]; then + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed\n" + echo -e "Unexpected successful server start $SERVER\n***" + kill_server + exit 1 +fi + +LOG_IDX=$((LOG_IDX+1)) + +## Backend Config, Plugin Test +SERVER_ARGS="${SERVER_ARGS_BASE} --backend-config=tensorrt,plugins=${CUSTOMPLUGIN}" +SERVER_LOG="./inference_server_$LOG_IDX.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +rm -f $CLIENT_LOG +set +e +python3 $PLUGIN_TEST PluginModelTest.test_raw_hard_max >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill_server + +LOG_IDX=$((LOG_IDX+1)) + +## LD_PRELOAD, Plugin Test +## LD_PRELOAD is only on Linux + +SERVER_LD_PRELOAD=$CUSTOMPLUGIN +SERVER_ARGS=$SERVER_ARGS_BASE +SERVER_LOG="./inference_server_$LOG_IDX.log" + +if [[ ! -v WSL_DISTRO_NAME ]] || [[ ! -v MSYSTEM ]]; then + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + rm -f $CLIENT_LOG + set +e + python3 $PLUGIN_TEST PluginModelTest.test_raw_hard_max >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill_server +fi + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET diff --git a/qa/L0_trt_plugin/trt_plugin_test.py b/qa/L0_trt_plugin/trt_plugin_test.py new file mode 100755 index 0000000000..7cfb098519 --- /dev/null +++ b/qa/L0_trt_plugin/trt_plugin_test.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 + +# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
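+
+# Reference sketch (comments only, mirrors hardmax_reference() below): a
+# hardmax turns the argmax position along the chosen axis into a one-hot
+# vector, with ties resolved to the first index as np.argmax does. For
+# example:
+#
+#   import numpy as np
+#   arr = np.array([[0.1, 0.9],
+#                   [0.4, 0.2]], dtype=np.float32)
+#   hardmax_reference(arr, axis=0)
+#   # array([[0., 1.],
+#   #        [1., 0.]], dtype=float32)
+#
+# For the all-ones inputs used by test_raw_hard_max, np.argmax returns index 0
+# everywhere, so the reference reduces to a one-hot at position 0 along the
+# chosen axis.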
+ +import sys + +sys.path.append("../common") + +import os +import unittest + +import numpy as np +import test_util as tu +import tritonclient.http as httpclient + +# By default, find tritonserver on "localhost", but can be overridden +# with TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") + + +def hardmax_reference(arr, axis=0): + one_hot = np.zeros(arr.shape, dtype=arr.dtype) + argmax = np.expand_dims(np.argmax(arr, axis), axis) + np.put_along_axis(one_hot, argmax, 1, axis=axis) + return one_hot + + +class PluginModelTest(tu.TestResultCollector): + def _full_exact(self, model_name, plugin_name, shape): + print(f"{_tritonserver_ipaddr}:8000") + triton_client = httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") + + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", list(shape), "FP32")) + + input0_data = np.ones(shape=shape).astype(np.float32) + inputs[0].set_data_from_numpy(input0_data, binary_data=True) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + + results = triton_client.infer( + model_name + "_" + plugin_name, inputs, outputs=outputs + ) + + output0_data = results.as_numpy("OUTPUT0") + tolerance_relative = 1e-6 + tolerance_absolute = 1e-7 + + # Verify values of Hardmax, GELU, and Normalize + if plugin_name == "CustomHardmax": + test_output = hardmax_reference(input0_data) + np.testing.assert_allclose( + output0_data, + test_output, + rtol=tolerance_relative, + atol=tolerance_absolute, + ) + else: + self.fail("Unexpected plugin: " + plugin_name) + + def test_raw_hard_max(self): + for bs in (1, 8): + self._full_exact( + "plan_float32_float32_float32", + "CustomHardmax", + (bs, 2, 2), + ) + + self._full_exact( + "plan_nobatch_float32_float32_float32", + "CustomHardmax", + (16, 1, 1), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_reformat_free/test.sh b/qa/L0_trt_reformat_free/test.sh new file mode 100755 index 0000000000..2daf2f0648 --- /dev/null +++ b/qa/L0_trt_reformat_free/test.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' +export CUDA_VISIBLE_DEVICES=0 + +CLIENT_LOG="./client.log" +TRT_TEST=trt_reformat_free_test.py + +DATADIR="./models" + +rm -rf ${DATADIR} +cp -r /data/inferenceserver/${REPO_VERSION}/qa_trt_format_model_repository/ ${DATADIR} + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR" +source ../common/util.sh + +rm -f *.log* + +RET=0 + +# TrtReformatFreeTest +CLIENT_LOG="./test_reformat_free.client.log" +SERVER_LOG="./test_reformat_free.inference_server.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +python $TRT_TEST TrtReformatFreeTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE 6 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET diff --git a/qa/L0_trt_reformat_free/trt_reformat_free_test.py b/qa/L0_trt_reformat_free/trt_reformat_free_test.py new file mode 100755 index 0000000000..c6a911783e --- /dev/null +++ b/qa/L0_trt_reformat_free/trt_reformat_free_test.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import unittest +from builtins import range + +import numpy as np +import test_util as tu +import tritonclient.http as tritonhttpclient +import tritonclient.utils.shared_memory as shm +from tritonclient.utils import InferenceServerException + + +def div_up(a, b): + return (a + b - 1) // b + + +def reformat(format, tensor_np): + if format == "CHW2": + factor = 2 + elif format == "CHW32": + factor = 32 + else: + raise ValueError( + "Unexpected format {} for testing reformat-free input".format(format) + ) + shape = list(tensor_np.shape) + [factor] + shape[-4] = div_up(shape[-4], factor) + reformatted_tensor_np = np.empty(shape, tensor_np.dtype) + if len(tensor_np.shape) == 3: + batch = [(tensor_np, reformatted_tensor_np)] + elif len(tensor_np.shape) == 4: + batch = [ + (tensor_np[idx], reformatted_tensor_np[idx]) + for idx in range(tensor_np.shape[0]) + ] + else: + raise ValueError( + "Unexpected numpy shape {} for testing reformat-free input".format( + tensor_np.shape + ) + ) + for tensor, reformatted_tensor in batch: + for c in range(tensor.shape[0]): + for h in range(tensor.shape[1]): + for w in range(tensor.shape[2]): + reformatted_tensor[c // factor][h][w][c % factor] = tensor[c][h][w] + return reformatted_tensor_np + + +class TrtReformatFreeTest(tu.TestResultCollector): + def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np): + byte_size = tensor_np.size * tensor_np.dtype.itemsize + self.shm_handles.append(shm.create_shared_memory_region(name, name, byte_size)) + # Put data values into shared memory + shm.set_shared_memory_region(self.shm_handles[-1], [tensor_np]) + # Register shared memory with Triton Server + self.triton_client.register_system_shared_memory(name, name, byte_size) + # Set the parameters to use data from shared memory + tensor.set_shared_memory(name, byte_size) + + def setUp(self): + self.shm_handles = [] + self.triton_client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) + + def tearDown(self): + self.triton_client.unregister_system_shared_memory() + for handle in self.shm_handles: + shm.destroy_shared_memory_region(handle) + + def test_nobatch_chw2_input(self): + model_name = "plan_nobatch_CHW2_LINEAR_float16_float16_float16" + input_np = np.arange(26, dtype=np.float16).reshape((13, 2, 1)) + expected_output0_np = input_np + input_np + expected_output1_np = input_np - input_np + reformatted_input_np = reformat("CHW2", input_np) + + # Use shared memory to bypass the shape check in client library, because + # for non-linear format tensor, the data buffer is padded and thus the + # data byte size may not match what is calculated from tensor shape + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input0", inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP16")) + 
self.add_reformat_free_data_as_shared_memory( + "input1", inputs[-1], reformatted_input_np + ) + + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # Validate the results by comparing with precomputed values. + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") + self.assertTrue( + np.array_equal(output0_np, expected_output0_np), + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) + self.assertTrue( + np.array_equal(output1_np, expected_output1_np), + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) + + def test_wrong_nobatch_chw2_input(self): + model_name = "plan_nobatch_CHW2_LINEAR_float16_float16_float16" + input_np = np.arange(26, dtype=np.float16).reshape((13, 2, 1)) + + # Use shared memory to bypass the shape check in client library, because + # for non-linear format tensor, the data buffer is padded and thus the + # data byte size may not match what is calculated from tensor shape + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP16")) + # Send the original size input instead of the reformatted size input. + self.add_reformat_free_data_as_shared_memory("input0", inputs[-1], input_np) + + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP16")) + # Send the original size input instead of the reformatted size input. + self.add_reformat_free_data_as_shared_memory("input1", inputs[-1], input_np) + + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + with self.assertRaises(InferenceServerException) as e: + self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + + err_str = str(e.exception) + self.assertIn( + "input byte size mismatch for input 'INPUT0' for model 'plan_nobatch_CHW2_LINEAR_float16_float16_float16'. Expected 56, got 52", + err_str, + ) + + def test_chw2_input(self): + model_name = "plan_CHW2_LINEAR_float16_float16_float16" + for bs in [1, 8]: + input_np = np.arange(26 * bs, dtype=np.float16).reshape((bs, 13, 2, 1)) + expected_output0_np = input_np + input_np + expected_output1_np = input_np - input_np + reformatted_input_np = reformat("CHW2", input_np) + + # Use shared memory to bypass the shape check in client library, + # because for non-linear format tensor, the data buffer is padded + # and thus the data byte size may not match what is calculated from + # tensor shape + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input0" + str(bs), inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input1" + str(bs), inputs[-1], reformatted_input_np + ) + + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # Validate the results by comparing with precomputed values. 
+ output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") + self.assertTrue( + np.array_equal(output0_np, expected_output0_np), + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) + self.assertTrue( + np.array_equal(output1_np, expected_output1_np), + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) + + def test_wrong_chw2_input(self): + model_name = "plan_CHW2_LINEAR_float16_float16_float16" + for bs in [1, 8]: + input_np = np.arange(26 * bs, dtype=np.float16).reshape((bs, 13, 2, 1)) + + # Use shared memory to bypass the shape check in client library, + # because for non-linear format tensor, the data buffer is padded + # and thus the data byte size may not match what is calculated from + # tensor shape + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP16")) + # Send the original size input instead of the reformatted size input. + self.add_reformat_free_data_as_shared_memory( + "input0" + str(bs), inputs[-1], input_np + ) + + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP16")) + # Send the original size input instead of the reformatted size input. + self.add_reformat_free_data_as_shared_memory( + "input1" + str(bs), inputs[-1], input_np + ) + + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + with self.assertRaises(InferenceServerException) as e: + self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + err_str = str(e.exception) + # reformatted input size - (bs, 14, 2, 1) * size(float16) + expected_size = bs * 28 * 2 + # original input size - (bs, 13, 2, 1) * size(float16) + received_size = bs * 26 * 2 + self.assertIn( + f"input byte size mismatch for input 'INPUT0' for model 'plan_CHW2_LINEAR_float16_float16_float16'. Expected {expected_size}, got {received_size}", + err_str, + ) + + def test_nobatch_chw32_input(self): + model_name = "plan_nobatch_CHW32_LINEAR_float32_float32_float32" + input_np = np.arange(26, dtype=np.float32).reshape((13, 2, 1)) + expected_output0_np = input_np + input_np + expected_output1_np = input_np - input_np + reformatted_input_np = reformat("CHW32", input_np) + + # Use shared memory to bypass the shape check in client library, because + # for non-linear format tensor, the data buffer is padded and thus the + # data byte size may not match what is calculated from tensor shape + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input0", inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input1", inputs[-1], reformatted_input_np + ) + + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # Validate the results by comparing with precomputed values. 
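+        # The outputs come back in linear layout, so they can be compared
+        # directly against the linear expected arrays even though the inputs
+        # were supplied in the padded CHW32 form.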
+ output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") + self.assertTrue( + np.array_equal(output0_np, expected_output0_np), + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) + self.assertTrue( + np.array_equal(output1_np, expected_output1_np), + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) + + def test_chw32_input(self): + model_name = "plan_CHW32_LINEAR_float32_float32_float32" + for bs in [1, 8]: + input_np = np.arange(26 * bs, dtype=np.float32).reshape((bs, 13, 2, 1)) + expected_output0_np = input_np + input_np + expected_output1_np = input_np - input_np + reformatted_input_np = reformat("CHW32", input_np) + + # Use shared memory to bypass the shape check in client library, + # because for non-linear format tensor, the data buffer is padded + # and thus the data byte size may not match what is calculated from + # tensor shape + inputs = [] + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input0" + str(bs), inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input1" + str(bs), inputs[-1], reformatted_input_np + ) + + outputs = [] + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) + outputs.append( + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) + + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + # Validate the results by comparing with precomputed values. + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") + self.assertTrue( + np.array_equal(output0_np, expected_output0_np), + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) + self.assertTrue( + np.array_equal(output1_np, expected_output1_np), + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh new file mode 100755 index 0000000000..548ebb55af --- /dev/null +++ b/qa/L0_trt_shape_tensors/test.sh @@ -0,0 +1,257 @@ +#!/bin/bash +# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +TEST_RESULT_FILE='test_results.txt' +CLIENT_LOG="./client.log" +SHAPE_TENSOR_TEST=trt_shape_tensor_test.py + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +rm -fr *.log +rm -fr models && mkdir models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_shapetensor_model_repository/* models/. + +RET=0 + +# Must run on a single device or else the TRITONSERVER_DELAY_SCHEDULER +# can fail when the requests are distributed to multiple devices. +export CUDA_VISIBLE_DEVICES=0 + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +# python unittest seems to swallow ImportError and still return 0 +# exit code. So need to explicitly check CLIENT_LOG to make sure +# we see some running tests + +# Sanity tests +python $SHAPE_TENSOR_TEST InferShapeTensorTest.test_static_batch >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +python $SHAPE_TENSOR_TEST InferShapeTensorTest.test_nobatch >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +python $SHAPE_TENSOR_TEST InferShapeTensorTest.test_wrong_shape_values >$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n*** Sanity Test Passed*** \n" +else + exit $RET +fi + +# Prepare the config file for dynamic batching tests +for dtype in int32 int64; do + CONFIG_FILE="models/plan_zero_1_float32_${dtype}/config.pbtxt" + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" "$CONFIG_FILE" + sed -i "s/^version_policy:.*/version_policy: { specific { versions: [1] }}/" "$CONFIG_FILE" + echo "dynamic_batching { preferred_batch_size: [ 2, 6 ], max_queue_delay_microseconds: 10000000 }" >>"$CONFIG_FILE" +done + +for i in \ + test_dynamic_different_shape_values \ + test_dynamic_identical_shape_values; do + SERVER_LOG="./$i.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, $model_type" >>$CLIENT_LOG + + set +e + python $SHAPE_TENSOR_TEST InferShapeTensorTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test Failed $i\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID + done + +for i in \ + test_sequence_different_shape_values \ + test_sequence_identical_shape_values ; do + export TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 + export TRITONSERVER_DELAY_SCHEDULER=12 + SERVER_LOG="./$i.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i, $model_type" >>$CLIENT_LOG + + set +e + python $SHAPE_TENSOR_TEST SequenceBatcherShapeTensorTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test Failed $i\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + unset TRITONSERVER_DELAY_SCHEDULER + unset TRITONSERVER_BACKLOG_DELAY_SCHEDULER + kill $SERVER_PID + wait $SERVER_PID + done + +# Prepare the config file for dynamic sequence batching tests +for dtype in int32 int64; do + CONFIG_FILE="models/plan_dyna_sequence_float32_${dtype}/config.pbtxt" + sed -i "s/max_candidate_sequences:.*/max_candidate_sequences:4/" "$CONFIG_FILE" + sed -i "s/max_queue_delay_microseconds:.*/max_queue_delay_microseconds:5000000/" "$CONFIG_FILE" +done + +export NO_BATCHING=0 + +for i in \ + test_dynaseq_identical_shape_values_series \ + test_dynaseq_identical_shape_values_parallel \ + test_dynaseq_different_shape_values_series \ + test_dynaseq_different_shape_values_parallel \ + ;do + SERVER_ARGS="--model-repository=`pwd`/models" + SERVER_LOG="./$i.server.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $SHAPE_TENSOR_TEST DynaSequenceBatcherTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + + kill $SERVER_PID + wait $SERVER_PID +done + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +fi + +exit $RET diff --git a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py new file mode 100755 index 0000000000..551ee2f8c0 --- /dev/null +++ b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py @@ -0,0 +1,1122 @@ +#!/usr/bin/env python3 + +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import threading +import time +import unittest +from builtins import range + +import infer_util as iu +import numpy as np +import sequence_util as su +import test_util as tu +import tritonclient.grpc as grpcclient + +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) + +_model_instances = 1 +_max_queue_delay_ms = 10000 +_max_sequence_idle_ms = 5000 + +_deferred_exceptions_lock = threading.Lock() +_deferred_exceptions = [] + + +class InferShapeTensorTest(tu.TestResultCollector): + def setUp(self): + # The helper client for setup will be GRPC for simplicity. + self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001") + global _deferred_exceptions + _deferred_exceptions = [] + + def tearDown(self): + self.triton_client_.unregister_system_shared_memory() + self.triton_client_.unregister_cuda_shared_memory() + super().tearDown() + + def add_deferred_exception(self, ex): + global _deferred_exceptions + with _deferred_exceptions_lock: + _deferred_exceptions.append(ex) + + def check_deferred_exception(self): + # Just raise one of the exceptions... 
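+        # Exceptions raised in worker threads do not propagate to the main
+        # test thread, so check_response() records them via
+        # add_deferred_exception() and the first one is re-raised here after
+        # the threads are joined.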
+ with _deferred_exceptions_lock: + if len(_deferred_exceptions) > 0: + raise _deferred_exceptions[0] + + def check_response( + self, + bs, + thresholds, + shape_values, + dummy_input_shapes, + shm_region_names=None, + precreated_shm_regions=None, + shm_suffix="", + shape_tensor_input_dtype=np.int32, + ): + try: + # Add batch size to shape as full shape is expected + for i in range(len(dummy_input_shapes)): + dummy_input_shapes[i] = [ + bs, + ] + dummy_input_shapes[i] + start_ms = int(round(time.time() * 1000)) + + iu.infer_shape_tensor( + self, + "plan", + np.float32, + shape_values, + dummy_input_shapes, + use_grpc=False, + use_streaming=False, + shm_suffix=shm_suffix, + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + batch_size=bs, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + + end_ms = int(round(time.time() * 1000)) + + lt_ms = thresholds[0] + gt_ms = thresholds[1] + if lt_ms is not None: + self.assertTrue( + (end_ms - start_ms) < lt_ms, + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + if gt_ms is not None: + self.assertTrue( + (end_ms - start_ms) > gt_ms, + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) + except Exception as ex: + self.add_deferred_exception(ex) + + def check_setup(self, model_name): + # Make sure test.sh set up the correct batcher settings + config = self.triton_client_.get_model_config(model_name).config + bconfig = config.dynamic_batching + self.assertTrue(2 in bconfig.preferred_batch_size) + self.assertTrue(6 in bconfig.preferred_batch_size) + self.assertEqual( + bconfig.max_queue_delay_microseconds, _max_queue_delay_ms * 1000 + ) # 10 secs + + def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt): + # There is a time window between when responses are returned and statistics are updated. + # To prevent intermittent test failure during that window, wait up to 10 seconds for the + # inference statistics to be ready. 
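+        # Note that execution_count counts model executions (batches) while
+        # inference_count counts individual requests, e.g. two executions of
+        # batch size 3 yield exec_cnt=2 and infer_cnt=6.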
+ num_tries = 10 + for i in range(num_tries): + stats = self.triton_client_.get_inference_statistics(model_name, "1") + self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") + actual_exec_cnt = stats.model_stats[0].execution_count + if actual_exec_cnt == exec_cnt: + break + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i + ) + ) + time.sleep(1) + + self.assertEqual( + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) + + if batch_exec is not None: + batch_stats = stats.model_stats[0].batch_stats + print(batch_stats) + self.assertEqual( + len(batch_stats), + len(batch_exec), + "expected {} different batch-sizes, got {}".format( + len(batch_exec), len(batch_stats) + ), + ) + + for batch_stat in batch_stats: + bs = batch_stat.batch_size + bc = batch_stat.compute_infer.count + self.assertTrue( + bs in batch_exec, "did not find expected batch-size {}".format(bs) + ) + # Get count from one of the stats + self.assertEqual( + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) + + actual_exec_cnt = stats.model_stats[0].execution_count + self.assertEqual( + actual_exec_cnt, + exec_cnt, + "expected model-exec-count {}, got {}".format(exec_cnt, actual_exec_cnt), + ) + + actual_infer_cnt = stats.model_stats[0].inference_count + self.assertEqual( + actual_infer_cnt, + infer_cnt, + "expected model-inference-count {}, got {}".format( + infer_cnt, actual_infer_cnt + ), + ) + + actual_infer_cnt = stats.model_stats[0].inference_count + self.assertEqual( + actual_infer_cnt, + infer_cnt, + "expected model-inference-count {}, got {}".format( + infer_cnt, actual_infer_cnt + ), + ) + + def test_static_batch(self): + for shape_tensor_input_dtype in [np.int32, np.int64]: + iu.infer_shape_tensor( + self, + "plan", + np.float32, + [[32, 32]], + [[8, 4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + batch_size=8, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + iu.infer_shape_tensor( + self, + "plan", + np.float32, + [[4, 4]], + [[8, 32, 32]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + batch_size=8, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + iu.infer_shape_tensor( + self, + "plan", + np.float32, + [[4, 4]], + [[8, 4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + batch_size=8, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + + def test_nobatch(self): + for shape_tensor_input_dtype in [np.int32, np.int64]: + iu.infer_shape_tensor( + self, + "plan_nobatch", + np.float32, + [[32, 32]], + [[4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + iu.infer_shape_tensor( + self, + "plan_nobatch", + np.float32, + [[4, 4]], + [[32, 32]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + iu.infer_shape_tensor( + self, + "plan_nobatch", + np.float32, + [[4, 4]], + [[4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + + def test_wrong_shape_values(self): + over_shape_values = [[32, 33]] + for shape_tensor_input_dtype in [np.int32, np.int64]: + try: + iu.infer_shape_tensor( + self, + "plan", + np.float32, + over_shape_values, + [[8, 4, 4]], + 
use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + batch_size=8, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + # InferenceServerException will be raised from different namespace, + # use dynamic type characteristic to catch both ex + except Exception as ex: + self.assertIn( + "The shape value at index 2 is expected to be in range from 1 to 32, Got: 33", + ex.message(), + ) + + # Dynamic Batcher tests + def test_dynamic_different_shape_values(self): + # Send two requests with sum of static batch sizes == + # preferred size, but with different shape values. This + # should cause the requests to not be batched. The first + # response will come back immediately and the second + # delayed by the max batch queue delay + for shape_tensor_input_dtype in [np.int32, np.int64]: + try: + model_name = tu.get_zero_model_name("plan", 1, np.float32) + model_name = model_name + "_" + np.dtype(shape_tensor_input_dtype).name + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(3, (6000, None)), + kwargs={ + "shape_values": [[2, 2]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(3, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {3: 2}, 2, 6) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + def test_dynamic_identical_shape_values(self): + # Send two requests with sum of static batch sizes == + # preferred size, but with identical shape values. This + # should cause the requests to get batched. Both + # responses should come back immediately. 
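+        # Requests can only share a dynamically-created batch when their
+        # shape tensor values match, since the shape values apply to the
+        # whole batched execution; identical values therefore allow these two
+        # requests to be batched together.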
+ for shape_tensor_input_dtype in [np.int32, np.int64]: + try: + model_name = tu.get_zero_model_name("plan", 1, np.float32) + model_name = model_name + "_" + np.dtype(shape_tensor_input_dtype).name + + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_response, + args=(4, (6000, None)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(2, (6000, None)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads[0].start() + time.sleep(1) + threads[1].start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {6: 1}, 1, 6) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + +class SequenceBatcherShapeTensorTest(su.SequenceBatcherTestUtil): + def get_expected_result(self, expected_result, value, flag_str=None): + # Adjust the expected_result for models + expected_result = value + if (flag_str is not None) and ("start" in flag_str): + expected_result += 1 + return expected_result + + def test_sequence_identical_shape_values(self): + # Test model instances together are configured with + # total-batch-size 4. Send four equal-length sequences + # with identical shape values in parallel and make sure + # they get completely batched into batch-size 4 + # inferences. + self.clear_deferred_exceptions() + dtype = np.float32 + for shape_tensor_input_dtype in [np.int32, np.int64]: + try: + model_name = tu.get_sequence_model_name("plan", dtype) + model_name = model_name + "_" + np.dtype(shape_tensor_input_dtype).name + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. 
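+                # test.sh exports TRITONSERVER_DELAY_SCHEDULER=12 and
+                # TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 before starting the
+                # server for these sequence tests; the assertions below
+                # verify that setup.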
+ self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + precreated_shm0_handles = self.precreate_register_shape_tensor_regions( + value_list=((2, 1), (4, 2), (8, 3)), + dtype=dtype, + i=0, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + precreated_shm1_handles = self.precreate_register_shape_tensor_regions( + value_list=((2, 11), (4, 12), (8, 13)), + dtype=dtype, + i=1, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + precreated_shm2_handles = self.precreate_register_shape_tensor_regions( + value_list=((2, 111), (4, 112), (8, 113)), + dtype=dtype, + i=2, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + precreated_shm3_handles = self.precreate_register_shape_tensor_regions( + value_list=((2, 1111), (4, 1112), (8, 1113)), + dtype=dtype, + i=3, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1001, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 1, None), + (None, 4, 2, None), + ("end", 8, 3, None), + ), + self.get_expected_result(6, 3, "end"), + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1002, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 11, None), + (None, 4, 12, None), + ("end", 8, 13, None), + ), + self.get_expected_result(36, 13, "end"), + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1003, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 111, None), + (None, 4, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result(336, 113, "end"), + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1004, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 1111, None), + (None, 4, 1112, None), + ("end", 8, 1113, None), + ), + self.get_expected_result(3336, 1113, "end"), + precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + + for t in threads: + t.start() + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 3}, 3, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_sequence_different_shape_values(self): + # 
Test model instances together are configured with + # total-batch-size 4. Send four equal-length sequences with + # different shape values in 2 sequences and 2 sequences that + # share the same shape value. Make sure that the 2 sequences + # with same shapes batch together but other two sequences do + # not. + self.clear_deferred_exceptions() + dtype = np.float32 + + for shape_tensor_input_dtype in [np.int32, np.int64]: + precreated_shm0_handles = self.precreate_register_shape_tensor_regions( + value_list=((1, 1), (1, 2), (1, 3)), + dtype=dtype, + i=0, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + precreated_shm1_handles = self.precreate_register_shape_tensor_regions( + value_list=((32, 11), (32, 12), (32, 13)), + dtype=dtype, + i=1, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + precreated_shm2_handles = self.precreate_register_shape_tensor_regions( + value_list=((16, 111), (16, 112), (16, 113)), + dtype=dtype, + i=2, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + precreated_shm3_handles = self.precreate_register_shape_tensor_regions( + value_list=((1, 1111), (1, 1112), (1, 1113)), + dtype=dtype, + i=3, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + try: + model_name = tu.get_sequence_model_name("plan", dtype) + model_name = model_name + "_" + np.dtype(shape_tensor_input_dtype).name + self.check_setup(model_name) + + # Need scheduler to wait for queue to contain all + # inferences for both sequences. + self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + self.assertEqual( + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) + + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1001, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 1, 1, None), + (None, 1, 2, None), + ("end", 1, 3, None), + ), + self.get_expected_result(6, 3, "end"), + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1002, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 32, 11, None), + (None, 32, 12, None), + ("end", 32, 13, None), + ), + self.get_expected_result(36, 13, "end"), + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1003, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 16, 111, None), + (None, 16, 112, None), + ("end", 16, 113, None), + ), + self.get_expected_result(336, 113, "end"), + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + 1004, + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 1, 1111, None), + (None, 1, 1112, None), + ("end", 1, 1113, None), + ), + self.get_expected_result(3336, 1113, "end"), + 
precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}".format(self._testMethodName), + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + + for t in threads: + t.start() + time.sleep(1) + for t in threads: + t.join() + + self.check_deferred_exception() + self.check_status(model_name, {4: 3, 3: 6}, 9, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + +class DynaSequenceBatcherTest(su.SequenceBatcherTestUtil): + def get_expected_result(self, expected_result, corrid, value, flag_str=None): + expected_result = value + if flag_str is not None: + if "start" in flag_str: + expected_result += 1 + if "end" in flag_str: + expected_result += corrid + return expected_result + + def _multi_sequence_different_shape_impl(self, sleep_secs): + self.clear_deferred_exceptions() + dtype = np.float32 + + for shape_tensor_input_dtype in [np.int32, np.int64]: + precreated_shm0_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((1, 1), (12, 2), (2, 3)), + dtype=dtype, + i=0, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + precreated_shm1_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((3, 11), (4, 12), (5, 13)), + dtype=dtype, + i=1, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + precreated_shm2_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((6, 111), (7, 112), (8, 113)), + dtype=dtype, + i=2, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + precreated_shm3_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((9, 1111), (10, 1112), (11, 1113)), + dtype=dtype, + i=3, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + + try: + model_name = tu.get_dyna_sequence_model_name("plan", dtype) + model_name = model_name + "_" + np.dtype(shape_tensor_input_dtype).name + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + corrids = [1001, 1002, 1003, 1004] + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 1, 1, None), + (None, 12, 2, None), + ("end", 2, 3, None), + ), + self.get_expected_result( + 4 + corrids[0], corrids[0], 3, "end" + ), + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 3, 11, None), + (None, 4, 12, None), + ("end", 5, 13, None), + ), + self.get_expected_result( + 36 + corrids[1], corrids[1], 13, "end" + ), + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[1] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + 
threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 6, 111, None), + (None, 7, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result( + 336 + corrids[2], corrids[2], 113, "end" + ), + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 9, 1111, None), + (None, 10, 1112, None), + ("end", 11, 1113, None), + ), + self.get_expected_result( + 3336 + corrids[3], corrids[3], 1113, "end" + ), + precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + + for t in threads: + t.start() + if sleep_secs > 0: + time.sleep(sleep_secs) + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {1: 12}, 12, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def _multi_sequence_identical_shape_impl(self, sleep_secs): + self.clear_deferred_exceptions() + dtype = np.float32 + + for shape_tensor_input_dtype in [np.int32, np.int64]: + precreated_shm0_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((2, 1), (4, 2), (8, 3)), + dtype=dtype, + i=0, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + precreated_shm1_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((2, 11), (4, 12), (8, 13)), + dtype=dtype, + i=1, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + precreated_shm2_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((2, 111), (4, 112), (8, 113)), + dtype=dtype, + i=2, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + precreated_shm3_handles = ( + self.precreate_register_dynaseq_shape_tensor_regions( + value_list=((2, 1111), (4, 1112), (8, 1113)), + dtype=dtype, + i=3, + shape_tensor_input_dtype=shape_tensor_input_dtype, + ) + ) + + try: + model_name = tu.get_dyna_sequence_model_name("plan", dtype) + model_name = model_name + "_" + np.dtype(shape_tensor_input_dtype).name + self.check_setup(model_name) + self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) + + corrids = [1001, 1002, 1003, 1004] + threads = [] + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[0], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 1, None), + (None, 4, 2, None), + ("end", 8, 3, None), + ), + self.get_expected_result( + 4 + corrids[0], corrids[0], 3, "end" + ), + precreated_shm0_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "using_dynamic_batcher": 
True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[1], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 11, None), + (None, 4, 12, None), + ("end", 8, 13, None), + ), + self.get_expected_result( + 36 + corrids[1], corrids[1], 13, "end" + ), + precreated_shm1_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[1] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[2], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 111, None), + (None, 4, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result( + 336 + corrids[2], corrids[2], 113, "end" + ), + precreated_shm2_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_sequence_shape_tensor_io, + args=( + model_name, + dtype, + corrids[3], + (None, None), + # (flag_str, shape_value, value, pre_delay_ms) + ( + ("start", 2, 1111, None), + (None, 4, 1112, None), + ("end", 8, 1113, None), + ), + self.get_expected_result( + 3336 + corrids[3], corrids[3], 1113, "end" + ), + precreated_shm3_handles, + ), + kwargs={ + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "using_dynamic_batcher": True, + "shape_tensor_input_dtype": shape_tensor_input_dtype, + }, + ) + ) + + for t in threads: + t.start() + if sleep_secs > 0: + time.sleep(sleep_secs) + for t in threads: + t.join() + self.check_deferred_exception() + self.check_status(model_name, {4: 3}, 3, 12) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + finally: + if TEST_SYSTEM_SHARED_MEMORY: + self.cleanup_shm_regions(precreated_shm0_handles) + self.cleanup_shm_regions(precreated_shm1_handles) + self.cleanup_shm_regions(precreated_shm2_handles) + self.cleanup_shm_regions(precreated_shm3_handles) + + def test_dynaseq_identical_shape_values_series(self): + # Send four sequences with identical shape values in series + # and make sure they get completely batched into batch-size + # 4 inferences. + self._multi_sequence_identical_shape_impl(1) + + def test_dynaseq_identical_shape_values_parallel(self): + # Send four sequences with identical shape values in parallel + # and make sure they get completely batched into batch-size + # 4 inferences. + self._multi_sequence_identical_shape_impl(0) + + def test_dynaseq_different_shape_values_series(self): + # Send four sequences with different shape values in series + # and make sure they don't get batched together. + self._multi_sequence_different_shape_impl(1) + + def test_dynaseq_different_shape_values_parallel(self): + # Send four sequences with different shape values in parallel + # and make sure they don't get batched together. + self._multi_sequence_different_shape_impl(0) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_unit_test/test.sh b/qa/L0_unit_test/test.sh deleted file mode 100755 index 229e8e8ced..0000000000 --- a/qa/L0_unit_test/test.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. 
All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -export UNIT_TESTS="//src/..." -TEST_LOG="./unit_test.log" - -# Need to have all libraries on a standard path since that is what is -# expected by bazel test. -cp /opt/tensorrtserver/lib/* /usr/lib/. - -rm -f $TEST_LOG -RET=0 - -set +e - -# Return code 3 indicates a test failure so ignore that failure as we -# use 'show_testlogs' to parse out more specific error messages. -(cd /workspace && \ - bazel test -c opt --config=cuda --verbose_failures --cache_test_results=no \ - --build_tests_only -- $(bazel query "tests($UNIT_TESTS)")) > $TEST_LOG 2>&1 -BLDRET=$? -if [ $BLDRET -ne 0 ]; then - RET=1 - if [ $BLDRET -ne 3 ]; then - cat $TEST_LOG - echo -e "\n***\n*** Failed to build\n***" - exit 1 - fi -fi - -grep "test\.log$" $TEST_LOG | /workspace/qa/common/show_testlogs -if [ $? -ne 0 ]; then - RET=1 -fi - -set -e - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test FAILED\n***" -fi - -exit $RET diff --git a/qa/L0_vertex_ai/test.sh b/qa/L0_vertex_ai/test.sh new file mode 100755 index 0000000000..7403bf14cf --- /dev/null +++ b/qa/L0_vertex_ai/test.sh @@ -0,0 +1,722 @@ +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +TEST_RESULT_FILE='test_results.txt' + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 + +rm -rf multi_models single_model +rm -f *.log +rm -f *.out + +CLIENT_TEST_SCRIPT=vertex_ai_test.py +UNIT_TEST_COUNT=8 +CLIENT_LOG="./client.log" + +DATADIR=/data/inferenceserver/${REPO_VERSION} +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_LOG="./server.log" +source ../common/util.sh + +# Set up the multi model repository with the swap and non-swap versions +mkdir multi_models && \ + cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 multi_models/addsub && \ + rm -r multi_models/addsub/2 && rm -r multi_models/addsub/3 && \ + sed -i "s/onnx_int32_int32_int32/addsub/" multi_models/addsub/config.pbtxt && \ + cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 multi_models/subadd && \ + rm -r multi_models/subadd/1 && rm -r multi_models/subadd/2 && \ + sed -i "s/onnx_int32_int32_int32/subadd/" multi_models/subadd/config.pbtxt +mkdir single_model && \ + cp -r multi_models/addsub single_model/. + +# Use Vertex AI's health endpoint to check server status +# Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on +# success, 1 on failure +function vertex_ai_wait_for_server_ready() { + local spid="$1"; shift + local wait_time_secs="${1:-30}"; shift + + WAIT_RET=0 + + ping_address="localhost:8080${AIP_HEALTH_ROUTE}" + if [ -n "$AIP_HTTP_PORT" ]; then + ping_address="localhost:${AIP_HTTP_PORT}${AIP_HEALTH_ROUTE}" + fi + + local wait_secs=$wait_time_secs + until test $wait_secs -eq 0 ; do + if ! kill -0 $spid; then + echo "=== Server not running." + WAIT_RET=1 + return + fi + + sleep 1; + + set +e + code=`curl -s -w %{http_code} $ping_address` + set -e + if [ "$code" == "200" ]; then + return + fi + + ((wait_secs--)); + done + + echo "=== Timeout $wait_time_secs secs. Server not ready." 
+ WAIT_RET=1 +} + +# Helper function to unset all AIP variables before test +function unset_vertex_variables() { + unset AIP_MODE + unset AIP_HTTP_PORT + unset AIP_HEALTH_ROUTE + unset AIP_PREDICT_ROUTE + unset AIP_STORAGE_URI +} + +# +# Test default allow-vertex-ai +# +unset_vertex_variables + +# Enable HTTP endpoint to check server readiness in the case of disabling Vertex AI +BASE_SERVER_ARGS="--allow-http true --model-repository=single_model" +export AIP_HEALTH_ROUTE="/health" +export AIP_PREDICT_ROUTE="/predict" + +# Default false +SERVER_ARGS=${BASE_SERVER_ARGS} +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +kill $SERVER_PID +wait $SERVE_PID +set +e +# Expect no message regarding Vertex AI as it is disabled +grep "failed to start Vertex AI service" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected Vertex AI service is disabled\n***" + RET=1 +fi +grep "Started Vertex AI HTTPService at" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected Vertex AI service is disabled\n***" + RET=1 +fi +set -e +# Enable +SERVER_ARGS="${BASE_SERVER_ARGS} --allow-vertex-ai=true" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +kill $SERVER_PID +wait $SERVE_PID +set +e +grep "Started Vertex AI HTTPService at" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected Vertex AI service is enabled\n***" + RET=1 +fi +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi + +# Default true +# Note that when default true, HTTP / GRPC endpoints will be disabled, +# check those endpoints by enabling one of them at a time and greping keywords +export AIP_MODE=PREDICTION +SERVER_ARGS="--model-repository=single_model --allow-grpc=true" +# Using nowait as 'run_server' requires HTTP endpoint enabled +run_server_nowait +sleep 10 +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +kill $SERVER_PID +wait $SERVE_PID +set +e +grep "Started Vertex AI HTTPService at" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected Vertex AI service is enabled\n***" + RET=1 +fi +grep "Started GRPCInferenceService at" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected GRPC service is enabled\n***" + RET=1 +fi +# Expect no message regarding HTTP as it is disabled +grep "failed to start HTTP service" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected HTTP service is disabled\n***" + RET=1 +fi +grep "Started HTTPService at" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected HTTP service is disabled\n***" + RET=1 +fi +set -e + +# Disable +SERVER_ARGS="${BASE_SERVER_ARGS} --allow-vertex-ai=false --allow-http=true" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi +kill $SERVER_PID +wait $SERVE_PID +set +e +# Expect no message regarding Vertex AI as it is disabled +grep "failed to start Vertex AI service" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected Vertex AI service is disabled\n***" + RET=1 +fi +grep "Started Vertex AI HTTPService at" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. 
Expected Vertex AI service is disabled\n***" + RET=1 +fi +grep "Started HTTPService at" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected HTTP service is enabled\n***" + RET=1 +fi +# Expect no message regarding GRPC as it is disabled +grep "failed to start GRPC service" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected GRPC service is disabled\n***" + RET=1 +fi +grep "Started GRPCInferenceService at" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected GRPC service is disabled\n***" + RET=1 +fi +set -e + +# +# Test missing route +# +unset_vertex_variables +export AIP_HEALTH_ROUTE="/health" + +SERVER_ARGS="--allow-vertex-ai=true --model-repository=single_model" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" == "0" ]; then + echo -e "\n***\n*** Expect failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + RET=1 +else + set +e + grep "API_PREDICT_ROUTE is not defined for Vertex AI endpoint" $SERVER_LOG + set -e + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on using undefined route\n***" + RET=1 + fi +fi + +unset_vertex_variables +export AIP_PREDICT_ROUTE="/predict" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" == "0" ]; then + echo -e "\n***\n*** Expect failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + RET=1 +else + set +e + grep "AIP_HEALTH_ROUTE is not defined for Vertex AI endpoint" $SERVER_LOG + set -e + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on using undefined route\n***" + RET=1 + fi +fi + +# +# Test endpoints +# +unset_vertex_variables +export AIP_PREDICT_ROUTE="/predict" +export AIP_HEALTH_ROUTE="/health" + +SERVER_ARGS="--allow-vertex-ai=true --model-repository=single_model" +run_server_nowait +# health +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + cat $SERVER_LOG + exit 1 +fi + +# predict (single model) +set +e +python $CLIENT_TEST_SCRIPT >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVE_PID + +# +# AIP_STORAGE_URI / AIP_HTTP_PORT +# +unset_vertex_variables +export AIP_PREDICT_ROUTE="/predict" +export AIP_HEALTH_ROUTE="/health" +export AIP_STORAGE_URI=single_model +export AIP_HTTP_PORT=5234 + +SERVER_ARGS="--allow-vertex-ai=true" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + cat $SERVER_LOG + exit 1 +fi + +set +e +python $CLIENT_TEST_SCRIPT >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVE_PID + +# +# default model +# +unset_vertex_variables +export AIP_MODE=PREDICTION +export AIP_PREDICT_ROUTE="/predict" +export AIP_HEALTH_ROUTE="/health" + +export AIP_STORAGE_URI=single_model +SERVER_ARGS="--vertex-ai-default-model=subadd" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" == "0" ]; then + echo -e "\n***\n*** Expect failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + RET=1 +else + set +e + grep "Expect the default model 'subadd' is loaded" $SERVER_LOG + set -e + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on nonexistent default model\n***" + RET=1 + fi +fi + +export AIP_STORAGE_URI=multi_models +SERVER_ARGS="" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" == "0" ]; then + echo -e "\n***\n*** Expect failed to start $SERVER\n***" + kill $SERVER_PID || true + cat $SERVER_LOG + RET=1 +else + set +e + grep "Expect the model repository contains only a single model if default model is not specified" $SERVER_LOG + set -e + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on unspecified default model\n***" + RET=1 + fi +fi + +# Test AIP_STORAGE_URI won't be used if model repository is specified +SERVER_ARGS="--model-repository=single_model" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + cat $SERVER_LOG + exit 1 +fi + +set +e +# subadd should not be loaded +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/models/subadd/ready" localhost:8080/predict` +if [ "$code" == "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Expect 'subadd' is not loaded\n***" + RET=1 +fi +python $CLIENT_TEST_SCRIPT >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e +kill $SERVER_PID +wait $SERVE_PID + +# Test default model as well as multi model +SERVER_ARGS="--vertex-ai-default-model=addsub" +run_server_nowait +vertex_ai_wait_for_server_ready $SERVER_PID 10 +if [ "$WAIT_RET" != "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + kill $SERVER_PID + wait $SERVER_PID + cat $SERVER_LOG + exit 1 +fi + +set +e +python $CLIENT_TEST_SCRIPT >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $UNIT_TEST_COUNT + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi +set -e + +# Defer the server exit to test redirection as the same time + +# +# Redirect +# + +# Metrics +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: metrics" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "nv_inference_request_success" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected metrics are returned\n***" + RET=1 + fi +fi +set -e + +# All Model stats +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/models/stats" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "model_stats" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected model stats are returned\n***" + RET=1 + fi + grep "addsub" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'addsub' model stats are returned\n***" + RET=1 + fi + grep "subadd" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'subadd' model stats are returned\n***" + RET=1 + fi +fi +set -e + +# Single model stats +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/models/subadd/stats" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "model_stats" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected model stats are returned\n***" + RET=1 + fi + grep "addsub" ./curl.out + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Unexpected 'addsub' model stats are returned\n***" + RET=1 + fi + grep "subadd" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'subadd' model stats are returned\n***" + RET=1 + fi +fi +set -e + +# Server health +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/health/live" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +# Model ready +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/models/addsub/ready" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +# Server metadata +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "extensions" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected server metadata are returned\n***" + RET=1 + fi +fi +set -e + +# Model metadata +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/models/addsub" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "platform" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected model metadata are returned\n***" + RET=1 + fi +fi +set -e + +# Model config +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/models/addsub/config" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "version_policy" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. 
Expected model configuration are returned\n***" + RET=1 + fi +fi +set -e + +# shared memory (only test "status" as register requires shared memory allocation) +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/systemsharedmemory/status" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "name" ./curl.out + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected no region is registered\n***" + RET=1 + fi +fi +set -e + +# cuda shared memory (only test "status" as register requires shared memory allocation) +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/cudasharedmemory/status" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "name" ./curl.out + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected no region is registered\n***" + RET=1 + fi +fi +set -e + +# repository index +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/repository/index" localhost:8080/predict` +if [ "$code" != "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "state" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected model index are returned\n***" + RET=1 + fi + grep "addsub" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'addsub' in the index\n***" + RET=1 + fi + grep "subadd" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 'subadd' in the index\n***" + RET=1 + fi +fi +set -e + +# repository control (expect error) +rm -f ./curl.out +set +e +code=`curl -s -w %{http_code} -o ./curl.out -X POST -H "X-Vertex-Ai-Triton-Redirect: v2/repository/models/subadd/unload" localhost:8080/predict` +if [ "$code" == "200" ]; then + cat ./curl.out + echo -e "\n***\n*** Test Failed\n***" + RET=1 +else + grep "explicit model load / unload is not allowed" ./curl.out + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected error on model control\n***" + RET=1 + fi +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test FAILED\n***" +fi +exit $RET diff --git a/qa/L0_vertex_ai/vertex_ai_test.py b/qa/L0_vertex_ai/vertex_ai_test.py new file mode 100755 index 0000000000..b6f9fc42b4 --- /dev/null +++ b/qa/L0_vertex_ai/vertex_ai_test.py @@ -0,0 +1,328 @@ +#!/usr/bin/python +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys + +sys.path.append("../common") + +import os +import sys +import unittest + +import numpy as np +import requests +import test_util as tu +import tritonclient.http as httpclient + + +class VertexAiTest(tu.TestResultCollector): + def setUp(self): + port = os.getenv("AIP_HTTP_PORT", "8080") + predict_endpoint = os.getenv("AIP_PREDICT_ROUTE", "/predict") + self.model_ = os.getenv("TEST_EXPLICIT_MODEL_NAME", "addsub") + self.url_ = "http://localhost:{}{}".format(port, predict_endpoint) + self.input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + self.expected_output0_data_ = [x * 2 for x in self.input_data_] + self.expected_output1_data_ = [0 for x in self.input_data_] + + def test_predict(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + result = httpclient.InferenceServerClient.parse_response_body(r._content) + + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") + for i in range(16): + self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) + self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) + + def test_predict_specified_model(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/json", + "X-Vertex-Ai-Triton-Redirect": "v2/models/{}/infer".format(self.model_), + } + r = 
requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + result = httpclient.InferenceServerClient.parse_response_body(r._content) + + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") + if self.model_ == "addsub": + expected_output0_data = [x * 2 for x in self.input_data_] + expected_output1_data = [0 for x in self.input_data_] + else: + expected_output0_data = [0 for x in self.input_data_] + expected_output1_data = [x * 2 for x in self.input_data_] + for i in range(16): + self.assertEqual(output0_data[0][i], expected_output0_data[i]) + self.assertEqual(output1_data[0][i], expected_output1_data[i]) + + def test_predict_request_binary(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size={}".format( + header_length + ) + } + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + result = httpclient.InferenceServerClient.parse_response_body(r._content) + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") + for i in range(16): + self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) + self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) + + def test_predict_response_binary(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=False) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + request_body, _ = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = {"Content-Type": "application/json"} + r = requests.post(self.url_, data=request_body, headers=headers) + r.raise_for_status() + + header_length_str = r.headers["Inference-Header-Content-Length"] + result = httpclient.InferenceServerClient.parse_response_body( + r._content, header_length=int(header_length_str) + ) + + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") + for i in range(16): + self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) + self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) + + def test_malformed_binary_header(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = 
np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "additional-string/application/vnd.vertex-ai-triton.binary+json;json-header-size={}".format( + header_length + ) + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + def test_malformed_binary_header_not_number(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=additional-string{}".format( + header_length + ) + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + def test_malformed_binary_header_negative_number(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=-123" + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + def test_malformed_binary_header_large_number(self): + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + + # Initialize the data + input_data = np.array(self.input_data_, dtype=np.int32) + input_data = np.expand_dims(input_data, axis=0) + inputs[0].set_data_from_numpy(input_data, binary_data=True) + inputs[1].set_data_from_numpy(input_data, 
binary_data=False) + + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) + + headers = { + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=12345" + } + r = requests.post(self.url_, data=request_body, headers=headers) + self.assertEqual( + 400, + r.status_code, + "Expected error code {} returned for the request; got: {}".format( + 400, r.status_code + ), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_warmup/decoupled/1/model.py b/qa/L0_warmup/decoupled/1/model.py new file mode 100644 index 0000000000..9827a87f09 --- /dev/null +++ b/qa/L0_warmup/decoupled/1/model.py @@ -0,0 +1,39 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Test model that always returns 0 response for all requests.""" + + def execute(self, requests): + for request in requests: + request.get_response_sender().send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + return None diff --git a/qa/L0_warmup/decoupled/config.pbtxt b/qa/L0_warmup/decoupled/config.pbtxt new file mode 100644 index 0000000000..8d1f4f79b0 --- /dev/null +++ b/qa/L0_warmup/decoupled/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "decoupled" +backend: "python" +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +instance_group [{ kind: KIND_CPU }] +model_warmup [ +{ + name : "decoupled sample" + batch_size: 1 + inputs { + key: "INPUT" + value: { + data_type: TYPE_FP32 + dims: 4 + zero_data: true + } + } +}] +model_transaction_policy { + decoupled: True +} \ No newline at end of file diff --git a/qa/L0_warmup/failing_infer/1/model.py b/qa/L0_warmup/failing_infer/1/model.py new file mode 100644 index 0000000000..632477c903 --- /dev/null +++ b/qa/L0_warmup/failing_infer/1/model.py @@ -0,0 +1,45 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """Test model that always returns error for all requests.""" + + def execute(self, requests): + responses = [] + + for _ in requests: + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], error=pb_utils.TritonError("An Error Occurred") + ) + ) + + # You must return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses diff --git a/qa/L0_warmup/failing_infer/config.pbtxt b/qa/L0_warmup/failing_infer/config.pbtxt new file mode 100644 index 0000000000..e491844531 --- /dev/null +++ b/qa/L0_warmup/failing_infer/config.pbtxt @@ -0,0 +1,56 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "failing_infer" +backend: "python" +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +instance_group [{ kind: KIND_CPU }] +model_warmup [ +{ + name : "zero sample" + batch_size: 1 + inputs { + key: "INPUT" + value: { + data_type: TYPE_FP32 + dims: 4 + zero_data: true + } + } +}] diff --git a/qa/L0_warmup/raw_mug_data b/qa/L0_warmup/raw_mug_data new file mode 100644 index 0000000000..e9833f54bd Binary files /dev/null and b/qa/L0_warmup/raw_mug_data differ diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh new file mode 100755 index 0000000000..a535aed25b --- /dev/null +++ b/qa/L0_warmup/test.sh @@ -0,0 +1,487 @@ +#!/bin/bash +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +CLIENT=../clients/image_client +CLIENT_LOG="./client.log" +CLIENT_PY=./test_infer_shm_leak.py +EXPECTED_NUM_TESTS="1" +TEST_RESULT_FILE='test_results.txt' + +IMAGE="../images/vulture.jpeg" + +DATADIR=`pwd`/models + +# If BACKENDS not specified, set to all +BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch plan"} + +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=$DATADIR --log-verbose=1 --exit-timeout-secs=120" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + +RET=0 +rm -fr *.txt + +for BACKEND in ${BACKENDS}; do + rm -f $SERVER_LOG $CLIENT_LOG + # Test for fixed-size data type + # Use the addsub models as example. + rm -fr models && mkdir models + cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${BACKEND}_float32_float32_float32 models/. && \ + cp -r /data/inferenceserver/${REPO_VERSION}/qa_sequence_model_repository/${BACKEND}_sequence_int32 models/. + + INPUT_PREFIX="INPUT" + IDENTITY_INPUT_PREFIX="INPUT" && [ "$BACKEND" == "libtorch" ] && IDENTITY_INPUT_PREFIX="INPUT__" + SEQ_INPUT="INPUT" && [ "$BACKEND" == "libtorch" ] && SEQ_INPUT="INPUT__0" + START="START" && [ "$BACKEND" == "libtorch" ] && START="START__1" + READY="READY" && [ "$BACKEND" == "libtorch" ] && READY="READY__2" + + # 2 instances per device with random / zero data. 
+ # The zero data sample will run twice + # + # Provide warmup instruction (batch size 1) in model config + (cd models/${BACKEND}_float32_float32_float32 && \ + echo "model_warmup [{" >> config.pbtxt && \ + echo " name : \"regular sample\"" >> config.pbtxt && \ + echo " batch_size: 1" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${INPUT_PREFIX}0\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${INPUT_PREFIX}1\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_FP32" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " random_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}]" >> config.pbtxt ) + + # zero data. For realistic sequence model, 'count' may not work + # well because the model will expect a valid sequence of requests which + # should be represented by a series of warmup samples. 'count > 1' + # essentially "resends" one of the sample, which may invalidate the + # sequence. This is okay for this specific test because the synthetic model + # is not data sensitive. + # + # Instruction for sequence model (batch size 8), need to specify control tensor + (cd models/${BACKEND}_sequence_int32 && \ + echo "model_warmup [{" >> config.pbtxt && \ + echo " name : \"sequence sample\"" >> config.pbtxt && \ + echo " count : 2" >> config.pbtxt && \ + echo " batch_size: 8" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${SEQ_INPUT}\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_INT32" >> config.pbtxt && \ + echo " dims: 1" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${START}\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_INT32" >> config.pbtxt && \ + echo " dims: 1" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${READY}\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_INT32" >> config.pbtxt && \ + echo " dims: 1" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}]" >> config.pbtxt ) + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + grep "is running warmup sample 'regular sample'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup for stateless model\n***" + RET=1 + fi + grep "is running warmup sample 'sequence sample' for iteration 1" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 1st warmup iteration for stateful model\n***" + RET=1 + fi + grep "is running warmup sample 'sequence sample' for iteration 2" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected 2nd warmup iteration for stateful model\n***" + RET=1 + fi + grep "failed to run warmup" $SERVER_LOG + if [ $? 
-eq 0 ]; then + echo -e "\n***\n*** Failed. Expected no warmup error\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + + # Test for variable-size data type (string) + rm -fr models && mkdir models + SUPPORT_STRING=0 && ([[ $BACKEND == "savedmodel" ]] || [[ $BACKEND == "onnx" ]] || [[ $BACKEND == "savedmodel" ]]) && SUPPORT_STRING=1 + if [ "$SUPPORT_STRING" == "1" ] ; then + cp -r /data/inferenceserver/${REPO_VERSION}/qa_sequence_model_repository/${BACKEND}_sequence_object models/. + cp -r /data/inferenceserver/${REPO_VERSION}/qa_identity_model_repository/${BACKEND}_zero_1_object models/. + + # random and zero data (two samples) + # + # Provide warmup instruction (batch size 1) in model config + (cd models/${BACKEND}_zero_1_object && \ + echo "model_warmup [" >> config.pbtxt && \ + echo "{" >> config.pbtxt && \ + echo " name : \"zero string stateless\"" >> config.pbtxt && \ + echo " batch_size: 1" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${IDENTITY_INPUT_PREFIX}0\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_STRING" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}," >> config.pbtxt && \ + echo "{" >> config.pbtxt && \ + echo " name : \"random string stateless\"" >> config.pbtxt && \ + echo " batch_size: 1" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${IDENTITY_INPUT_PREFIX}0\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_STRING" >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " random_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}" >> config.pbtxt && \ + echo "]" >> config.pbtxt ) + + # user provided data + # + # Instruction for sequence model (batch size 8), need to specify control tensor + (cd models/${BACKEND}_sequence_object && \ + echo "model_warmup [{" >> config.pbtxt && \ + echo " name : \"string statefull\"" >> config.pbtxt && \ + echo " batch_size: 8" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${SEQ_INPUT}\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_STRING" >> config.pbtxt && \ + echo " dims: 1" >> config.pbtxt && \ + echo " input_data_file: \"raw_string_data\"" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${START}\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_INT32" >> config.pbtxt && \ + echo " dims: 1" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " inputs {" >> config.pbtxt && \ + echo " key: \"${READY}\"" >> config.pbtxt && \ + echo " value: {" >> config.pbtxt && \ + echo " data_type: TYPE_INT32" >> config.pbtxt && \ + echo " dims: 1" >> config.pbtxt && \ + echo " zero_data: true" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo " }" >> config.pbtxt && \ + echo "}]" >> config.pbtxt ) + + # Prepare string data (one element that is "233") + mkdir -p models/${BACKEND}_sequence_object/warmup && \ + (cd models/${BACKEND}_sequence_object/warmup && \ + echo -n -e '\x03\x00\x00\x00\x32\x33\x33' > raw_string_data) + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e 
"\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + grep "is running warmup sample 'zero string stateless'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup for zero string stateless model\n***" + RET=1 + fi + grep "is running warmup sample 'random string stateless'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup for random string stateless model\n***" + RET=1 + fi + grep "is running warmup sample 'string statefull'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup for string stateful model\n***" + RET=1 + fi + grep "failed to run warmup" $SERVER_LOG + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected no warmup error\n***" + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + fi + + if [ "$BACKEND" == "graphdef" ]; then + # Show effect of warmup by using a TF model with TF-TRT optimization which is + # known to be slow on first inference. + # Note: model can be obatined via the fetching script in docs/example + rm -fr models && \ + mkdir models && \ + cp -r /data/inferenceserver/${REPO_VERSION}/tf_model_store/inception_v3_graphdef models/. + + # Enable TF-TRT optimization + (cd models/inception_v3_graphdef && \ + echo "optimization { execution_accelerators { gpu_execution_accelerator : [ { name : \"tensorrt\"} ] } }" >> config.pbtxt) + + # Duplicate the same model with warmup enabled + cp -r models/inception_v3_graphdef models/inception_v3_warmup && + (cd models/inception_v3_warmup && \ + sed -i 's/inception_v3_graphdef/inception_v3_warmup/' config.pbtxt) + + (cd models/inception_v3_warmup && \ + echo 'model_warmup [{' >> config.pbtxt && \ + echo ' name : "image sample"' >> config.pbtxt && \ + echo ' batch_size: 1' >> config.pbtxt && \ + echo ' inputs {' >> config.pbtxt && \ + echo ' key: "input"' >> config.pbtxt && \ + echo ' value: {' >> config.pbtxt && \ + echo ' data_type: TYPE_FP32' >> config.pbtxt && \ + echo ' dims: [ 299, 299, 3 ]' >> config.pbtxt && \ + echo ' input_data_file: "raw_mug_data"' >> config.pbtxt && \ + echo ' }' >> config.pbtxt && \ + echo ' }' >> config.pbtxt && \ + echo '}]' >> config.pbtxt ) + + # prepare provided data instead of synthetic one + mkdir -p models/inception_v3_warmup/warmup && \ + cp raw_mug_data models/inception_v3_warmup/warmup/. + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + set +e + + grep "is running warmup sample 'image sample'" $SERVER_LOG + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup for image model\n***" + RET=1 + fi + grep "failed to run warmup" $SERVER_LOG + if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected no warmup error\n***" + RET=1 + fi + + # Time the first inference for both models + time $CLIENT -m inception_v3_graphdef -s INCEPTION $IMAGE -i grpc -u localhost:8001 >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 + fi + time $CLIENT -m inception_v3_warmup -s INCEPTION $IMAGE -i grpc -u localhost:8001 >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test Failed\n***" + cat $CLIENT_LOG + RET=1 + fi + + set -e + + kill $SERVER_PID + wait $SERVER_PID + fi +done + +# Test warmup sample failure +rm -fr models && \ + mkdir models && \ + cp -r failing_infer models/. 
+ +run_server +if [ "$SERVER_PID" != "0" ]; then + echo -e "\n***\n*** Expect fail to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +grep "failed to run warmup sample 'zero sample': An Error Occurred;" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup error\n***" + cat $SERVER_LOG + RET=1 +fi +set -e + +# Test decoupled model +rm -fr models && \ + mkdir models && \ + cp -r decoupled models/. + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e +grep "is running warmup sample 'decoupled sample'" $SERVER_LOG +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Failed. Expected warmup for decoupled model\n***" + RET=1 +fi +grep "failed to run warmup" $SERVER_LOG +if [ $? -eq 0 ]; then + echo -e "\n***\n*** Failed. Expected no warmup error\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + +# Test the onnx model to verify that the memory type of the output tensor +# remains unchanged with the warmup setting +pip3 uninstall -y torch +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html + +rm -fr models && mkdir models +cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/onnx_nobatch_float32_float32_float32 models/. +(cd models/onnx_nobatch_float32_float32_float32 && \ + echo "" >> config.pbtxt && \ + echo 'instance_group [{' >> config.pbtxt && \ + echo ' kind : KIND_GPU' >> config.pbtxt && \ + echo '}]' >> config.pbtxt && \ + echo 'model_warmup [{' >> config.pbtxt && \ + echo ' name : "sample"' >> config.pbtxt && \ + echo ' batch_size: 1' >> config.pbtxt && \ + echo ' inputs {' >> config.pbtxt && \ + echo ' key: "INPUT0"' >> config.pbtxt && \ + echo ' value: {' >> config.pbtxt && \ + echo ' data_type: TYPE_FP32' >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " zero_data: false" >> config.pbtxt && \ + echo ' }' >> config.pbtxt && \ + echo ' }' >> config.pbtxt && \ + echo ' inputs {' >> config.pbtxt && \ + echo ' key: "INPUT1"' >> config.pbtxt && \ + echo ' value: {' >> config.pbtxt && \ + echo ' data_type: TYPE_FP32' >> config.pbtxt && \ + echo " dims: 16" >> config.pbtxt && \ + echo " zero_data: false" >> config.pbtxt && \ + echo ' }' >> config.pbtxt && \ + echo ' }' >> config.pbtxt && \ + echo '}]' >> config.pbtxt ) + +mkdir -p models/bls_onnx_warmup/1/ +cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/ +cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/. + +cp ../L0_backend_python/test_infer_shm_leak.py . +sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +set +e + +export MODEL_NAME='bls_onnx_warmup' +python3 -m pytest --junitxml=warmup.report.xml $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** 'bls_onnx_warmup' test FAILED. \n***" + cat $CLIENT_LOG + RET=1 +fi + +set -e + + +kill $SERVER_PID +wait $SERVER_PID + + +if [ $RET -eq 1 ]; then + cat $CLIENT_LOG + cat $SERVER_LOG + echo -e "\n***\n*** Test Failed \n***" +else + echo -e "\n***\n*** Test Passed \n***" +fi + +exit $RET diff --git a/qa/common/busy_op_kernel.cc b/qa/common/busy_op_kernel.cc new file mode 100644 index 0000000000..119ed0a1ce --- /dev/null +++ b/qa/common/busy_op_kernel.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include + +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" + +using namespace tensorflow; // NOLINT(build/namespaces) + +REGISTER_OP("BusyLoop").Input("input: int32").Output("output: int32").Doc(R"doc( +Busy waits for input number of clock cycles +)doc"); + +void BusyLoopKernelLauncher( + const Eigen::GpuDevice& device, const int* num_delay_cycles, int* out); + +class BusyLoopOp : public OpKernel { + public: + explicit BusyLoopOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override + { + // Grab the input + const Tensor& input_tensor = context->input(0); + auto num_delay_cycles = input_tensor.flat(); + + // Create dummy output + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK( + context, + context->allocate_output(0, input_tensor.shape(), &output_tensor)); + auto output = output_tensor->template flat(); + + // Verify input dimension + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_tensor.shape()), + errors::InvalidArgument( + "BusyLoop expects a single value as a 1-D Vector")); + + // Call the cuda kernel launcher + BusyLoopKernelLauncher( + context->eigen_device(), num_delay_cycles.data(), + output.data()); + } +}; + +REGISTER_KERNEL_BUILDER(Name("BusyLoop").Device(DEVICE_GPU), BusyLoopOp); diff --git a/qa/common/busy_op_kernel.cu.cc b/qa/common/busy_op_kernel.cu.cc new file mode 100644 index 0000000000..e7d8c42ce0 --- /dev/null +++ b/qa/common/busy_op_kernel.cu.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include +#include + +#include "unsupported/Eigen/CXX11/Tensor" + +__device__ long store_now[1]; + +__global__ void +BusyLoopKernel(const int* num_delay_cycles, int* out) +{ + // As shown in + // https://stackoverflow.com/questions/11217117/equivalent-of-usleep-in-cuda-kernel + clock_t start = clock(); + + for (;;) { + clock_t now = clock(); + // Adjust for overflow + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= num_delay_cycles[0]) { + break; + } + // Prevent nvcc optimizations + store_now[0] = cycles; + } +} + +void +BusyLoopKernelLauncher( + const Eigen::GpuDevice& device, const int* num_delay_cycles, int* out) +{ + auto stream = device.stream(); + BusyLoopKernel<<<1, 256, 0, stream>>>(num_delay_cycles, out); +} + +#endif diff --git a/qa/common/check_copyright.py b/qa/common/check_copyright.py index 11c32902e6..8aea78e9fd 100755 --- a/qa/common/check_copyright.py +++ b/qa/common/check_copyright.py @@ -1,6 +1,6 @@ -#!/usr/bin/python +#!/usr/bin/env python3 -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,24 +28,71 @@ import argparse import os +import pathlib import re FLAGS = None -SKIP_EXTS = ('jpeg', 'jpg', 'pgm', 'png', - 'log', 'serverlog', - 'preprocessed', 'jmx', 'gz', - 'caffemodel', 'prototxt') -SKIP_PATHS = ('docs/examples/model_repository', - 'serving', - 'src/servables/caffe2/testdata', - 'src/servables/tensorflow/testdata', - 'src/servables/tensorrt/testdata', - 'src/test/testdata', - 'tools/patch', - 'VERSION') - -COPYRIGHT =''' -Copyright (c) YYYY, NVIDIA CORPORATION. All rights reserved. +SKIP_EXTS = ( + ".jpeg", + ".jpg", + ".pgm", + ".png", + ".log", + ".preprocessed", + ".jmx", + ".gz", + ".json", + ".pdf", + ".so", + ".onnx", + ".svg", + "pull_request_template.md", +) +REPO_PATH_FROM_THIS_FILE = "../.." 
+SKIP_PATHS = ( + "build", + "deploy/gke-marketplace-app/.gitignore", + "deploy/gke-marketplace-app/server-deployer/chart/.helmignore", + "deploy/gcp/.helmignore", + "deploy/aws/.helmignore", + "deploy/fleetcommand/.helmignore", + "docs/.gitignore", + "docs/_static/.gitattributes", + "docs/examples/model_repository", + "docs/examples/jetson", + "docker", + "qa/common/cuda_op_kernel.cu.cc.patch", + "qa/ensemble_models/mix_platform_float32_float32_float32/output0_labels.txt", + "qa/ensemble_models/mix_type_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/mix_ensemble_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/wrong_label_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/label_override_int32_float32_float32/output0_labels.txt", + "qa/L0_model_config/noautofill_platform", + "qa/L0_model_config/autofill_noplatform", + "qa/L0_model_config/autofill_noplatform_success", + "qa/L0_model_config/special_cases", + "qa/L0_model_config/cli_messages/cli_override/expected", + "qa/L0_model_config/cli_messages/cli_deprecation/expected", + "qa/L0_model_namespacing/test_duplication", + "qa/L0_model_namespacing/test_dynamic_resolution", + "qa/L0_model_namespacing/test_ensemble_duplication", + "qa/L0_model_namespacing/test_no_duplication", + "qa/L0_perf_nomodel/baseline", + "qa/L0_perf_nomodel/legacy_baseline", + "qa/L0_warmup/raw_mug_data", + "qa/L0_java_resnet/expected_output_data", + "qa/L0_trt_dla_jetson/trt_dla_model_store", + "qa/openvino_models/dynamic_batch", + "qa/openvino_models/fixed_batch", + "CITATION.cff", + "TRITON_VERSION", + ".github/ISSUE_TEMPLATE", + ".github/PULL_REQUEST_TEMPLATE", +) + +COPYRIGHT_YEAR_RE = "Copyright( \\(c\\))? 20[1-9][0-9](-(20)?[1-9][0-9])?(,((20[2-9][0-9](-(20)?[2-9][0-9])?)|([2-9][0-9](-[2-9][0-9])?)))*,? NVIDIA CORPORATION( & AFFILIATES)?. All rights reserved." + +COPYRIGHT = """ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -70,40 +117,52 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -''' +""" + +repo_abs_path = ( + pathlib.Path(__file__).parent.joinpath(REPO_PATH_FROM_THIS_FILE).resolve() +) + +copyright_year_re = re.compile(COPYRIGHT_YEAR_RE) -copyright_list = [l.rstrip() for i, l in enumerate(COPYRIGHT.splitlines()) if i > 0] def visit(path): if FLAGS.verbose: print("visiting " + path) for skip in SKIP_EXTS: - if path.endswith('.' + skip): + if path.endswith(skip): if FLAGS.verbose: print("skipping due to extension: " + path) return True for skip in SKIP_PATHS: - if path.startswith(skip): + if str(pathlib.Path(path).resolve()).startswith( + str(repo_abs_path.joinpath(skip).resolve()) + ): if FLAGS.verbose: print("skipping due to path prefix: " + path) return True - with open(path, 'r') as f: + with open(path, "r") as f: first_line = True line = None try: for fline in f: line = fline - # Skip any '#!' or '..' (from rst) lines at the start - # of the file + # Skip any '#!', '..', ' + +The models in this directory are TF2/keras models converted into OpenVINO +models. The "fixed_batch" model has a fixed batch dimension of 1 and the +"dynamic_batch" model has a variable batch dimension. + +The models are currently in **beta**, which they might not work as expected and +could be **changed, moved or deleted without warning** in the future. 
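(Illustrative sketch, not part of the diff above: one way such OpenVINO fixtures are typically exercised through the Triton HTTP client. The tensor names "INPUT"/"OUTPUT", the FP32 dtype, and the [1, 4] shape are assumptions made for the example; the real names and types are defined in the model.xml files added below.)

```python
# Hedged example: tensor names, dtype, and shape are assumptions, not taken
# from the model.xml files in this PR.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

inp = httpclient.InferInput("INPUT", [1, 4], "FP32")
inp.set_data_from_numpy(np.zeros((1, 4), dtype=np.float32))

# "fixed_batch" accepts only a batch dimension of 1; "dynamic_batch" would
# also accept other batch sizes, e.g. shape [8, 4].
result = client.infer(model_name="fixed_batch", inputs=[inp])
print(result.as_numpy("OUTPUT"))
```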
diff --git a/qa/openvino_models/dynamic_batch/1/model.bin b/qa/openvino_models/dynamic_batch/1/model.bin new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/openvino_models/dynamic_batch/1/model.mapping b/qa/openvino_models/dynamic_batch/1/model.mapping new file mode 100644 index 0000000000..4705831777 --- /dev/null +++ b/qa/openvino_models/dynamic_batch/1/model.mapping @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qa/openvino_models/dynamic_batch/1/model.xml b/qa/openvino_models/dynamic_batch/1/model.xml new file mode 100644 index 0000000000..59594953c6 --- /dev/null +++ b/qa/openvino_models/dynamic_batch/1/model.xml @@ -0,0 +1,166 @@ + + + + + + + + + + + 1 + 4 + + + + + + + + + + + + + + 1 + 4 + + + + + + + + + + + + + + 1 + 4 + + + 1 + 4 + + + + + 1 + 4 + + + + + + + + + + + + + + 1 + 4 + + + 1 + 4 + + + + + 1 + 4 + + + + + + + + + + + + + 1 + 4 + + + + + + + + + + 1 + 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qa/openvino_models/fixed_batch/1/model.bin b/qa/openvino_models/fixed_batch/1/model.bin new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/openvino_models/fixed_batch/1/model.mapping b/qa/openvino_models/fixed_batch/1/model.mapping new file mode 100644 index 0000000000..bd1a4eccb8 --- /dev/null +++ b/qa/openvino_models/fixed_batch/1/model.mapping @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qa/openvino_models/fixed_batch/1/model.xml b/qa/openvino_models/fixed_batch/1/model.xml new file mode 100644 index 0000000000..e0f8954866 --- /dev/null +++ b/qa/openvino_models/fixed_batch/1/model.xml @@ -0,0 +1,152 @@ + + + + + + + + + + + 1 + 4 + + + + + + + + + + + 1 + 4 + + + + + + + + + + + 1 + 4 + + + 1 + 4 + + + + + 1 + 4 + + + + + + + + + + + 1 + 4 + + + 1 + 4 + + + + + 1 + 4 + + + + + + + + + + 1 + 4 + + + + + + + + + + 1 + 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qa/python_models/add_sub/config.pbtxt b/qa/python_models/add_sub/config.pbtxt new file mode 100644 index 0000000000..39bd6771d0 --- /dev/null +++ b/qa/python_models/add_sub/config.pbtxt @@ -0,0 +1,58 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/add_sub/model.py b/qa/python_models/add_sub/model.py new file mode 100644 index 0000000000..0868014804 --- /dev/null +++ b/qa/python_models/add_sub/model.py @@ -0,0 +1,74 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + return responses diff --git a/qa/python_models/add_sub_gpu/config.pbtxt b/qa/python_models/add_sub_gpu/config.pbtxt new file mode 100644 index 0000000000..dd4a3ebecf --- /dev/null +++ b/qa/python_models/add_sub_gpu/config.pbtxt @@ -0,0 +1,63 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
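The add_sub config and Python model above fix the interface (two FP32 inputs and two FP32 outputs of dims [16], no batching). A hedged client-side sketch of exercising it (assuming a local tritonserver with the model loaded and the tritonclient[http] package; the endpoint and variable names are illustrative):

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient("localhost:8000")

# Shapes follow the config.pbtxt above: max_batch_size is unset, dims are [16].
input0 = httpclient.InferInput("INPUT0", [16], "FP32")
input1 = httpclient.InferInput("INPUT1", [16], "FP32")
input0.set_data_from_numpy(np.random.rand(16).astype(np.float32))
input1.set_data_from_numpy(np.random.rand(16).astype(np.float32))

result = client.infer("add_sub", inputs=[input0, input1])
print(result.as_numpy("OUTPUT0"))  # element-wise sum
print(result.as_numpy("OUTPUT1"))  # element-wise difference
```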
+ +name: "add_sub_gpu" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 4 ] + + + } +] + +instance_group [ { kind: KIND_GPU }] diff --git a/qa/python_models/async_execute_decouple/config.pbtxt b/qa/python_models/async_execute_decouple/config.pbtxt new file mode 100644 index 0000000000..847661d176 --- /dev/null +++ b/qa/python_models/async_execute_decouple/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 8 + +input [ + { + name: "WAIT_SECONDS" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "DUMMY_OUT" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +model_transaction_policy { decoupled: True } diff --git a/qa/python_models/async_execute_decouple/model.py b/qa/python_models/async_execute_decouple/model.py new file mode 100644 index 0000000000..8a529c209c --- /dev/null +++ b/qa/python_models/async_execute_decouple/model.py @@ -0,0 +1,77 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import asyncio + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + async def execute(self, requests): + processed_requests = [] + async_tasks = [] + for request in requests: + wait_secs_tensors = pb_utils.get_input_tensor_by_name( + request, "WAIT_SECONDS" + ).as_numpy() + for wait_secs_tensor in wait_secs_tensors: + wait_secs = wait_secs_tensor[0] + if wait_secs < 0: + self.raise_value_error(requests) + async_tasks.append(asyncio.create_task(asyncio.sleep(wait_secs))) + processed_requests.append( + { + "response_sender": request.get_response_sender(), + "batch_size": wait_secs_tensors.shape[0], + } + ) + + # This decoupled execute should be scheduled to run in the background + # concurrently with other instances of decoupled execute, as long as the event + # loop is not blocked. + await asyncio.gather(*async_tasks) + + for p_req in processed_requests: + response_sender = p_req["response_sender"] + batch_size = p_req["batch_size"] + + output_tensors = pb_utils.Tensor( + "DUMMY_OUT", np.array([0 for i in range(batch_size)], np.float32) + ) + response = pb_utils.InferenceResponse(output_tensors=[output_tensors]) + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + return None + + def raise_value_error(self, requests): + # TODO: Model may raise exception without sending complete final + for request in requests: + response_sender = request.get_response_sender() + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + raise ValueError("wait_secs cannot be negative") diff --git a/qa/python_models/async_execute_decouple_bls/config.pbtxt b/qa/python_models/async_execute_decouple_bls/config.pbtxt new file mode 100644 index 0000000000..847661d176 --- /dev/null +++ b/qa/python_models/async_execute_decouple_bls/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 8 + +input [ + { + name: "WAIT_SECONDS" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "DUMMY_OUT" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] +model_transaction_policy { decoupled: True } diff --git a/qa/python_models/async_execute_decouple_bls/model.py b/qa/python_models/async_execute_decouple_bls/model.py new file mode 100644 index 0000000000..a2fd5abf94 --- /dev/null +++ b/qa/python_models/async_execute_decouple_bls/model.py @@ -0,0 +1,60 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import asyncio + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + async def _execute_a_request(self, request): + input_tensor = pb_utils.get_input_tensor_by_name( + request, "WAIT_SECONDS" + ).as_numpy() + bls_input_tensor = pb_utils.Tensor("WAIT_SECONDS", input_tensor) + bls_request = pb_utils.InferenceRequest( + model_name="async_execute_decouple", + inputs=[bls_input_tensor], + requested_output_names=["DUMMY_OUT"], + ) + bls_responses = await bls_request.async_exec(decoupled=True) + response_sender = request.get_response_sender() + for bls_response in bls_responses: + bls_output_tensor = pb_utils.get_output_tensor_by_name( + bls_response, "DUMMY_OUT" + ).as_numpy() + output_tensor = pb_utils.Tensor("DUMMY_OUT", bls_output_tensor) + response = pb_utils.InferenceResponse(output_tensors=[output_tensor]) + response_sender.send(response) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + async def execute(self, requests): + async_futures = [] + for request in requests: + async_future = self._execute_a_request(request) + async_futures.append(async_future) + await asyncio.gather(*async_futures) + return None diff --git a/qa/python_models/auto_complete/model.py b/qa/python_models/auto_complete/model.py new file mode 100644 index 0000000000..7f67182387 --- /dev/null +++ b/qa/python_models/auto_complete/model.py @@ -0,0 +1,89 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
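Both async_execute_decouple and its BLS wrapper above are decoupled (`model_transaction_policy { decoupled: True }`), so a plain request/response infer call is not enough; responses arrive over a bidirectional stream. A hedged sketch of driving such a model with the gRPC streaming client (assuming tritonclient[grpc], a local server with the model loaded, and illustrative names):

```python
import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()


def callback(q, result, error):
    # Stream callback: collect each streamed response (or error) as it arrives.
    q.put(error if error is not None else result)


client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=partial(callback, results))

# max_batch_size is 8 and dims are [1], so the request shape is [1, 1].
wait_secs = grpcclient.InferInput("WAIT_SECONDS", [1, 1], "FP32")
wait_secs.set_data_from_numpy(np.array([[0.5]], dtype=np.float32))
client.async_stream_infer(model_name="async_execute_decouple", inputs=[wait_secs])

response = results.get()  # blocks until the single streamed response arrives
print(response.as_numpy("DUMMY_OUT"))
client.stop_stream()
```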
+ +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + return auto_complete_model_config + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + return responses diff --git a/qa/python_models/auto_complete_error/model.py b/qa/python_models/auto_complete_error/model.py new file mode 100644 index 0000000000..1d611c36d5 --- /dev/null +++ b/qa/python_models/auto_complete_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + """ + The body of this model doesn't matter. The main purpose of this model is + to test correct handling of Python errors in the `auto_complete_config` + function. + """ + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input0) + auto_complete_model_config.add_input(input1) + auto_complete_model_config.add_output(output0) + auto_complete_model_config.add_output(output1) + + undefined_variable + + return auto_complete_model_config + + def execute(self, requests): + pass diff --git a/qa/python_models/bls/config.pbtxt b/qa/python_models/bls/config.pbtxt new file mode 100644 index 0000000000..06bd3d41af --- /dev/null +++ b/qa/python_models/bls/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "bls" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/bls/model.py b/qa/python_models/bls/model.py new file mode 100644 index 0000000000..2f92bbbbdd --- /dev/null +++ b/qa/python_models/bls/model.py @@ -0,0 +1,812 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
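The BLS tests below repeatedly call a `square_int32` model that is not part of this excerpt: for an input value k it streams k responses, each echoing k, followed by a separate empty final response, which is why the assertions compare the input value against `response_count - 1`. A hedged sketch of what such a decoupled helper typically looks like (illustrative only, not the actual square_int32 source; it runs inside the Python backend, where pb_utils is provided):

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            k = pb_utils.get_input_tensor_by_name(request, "IN").as_numpy()[0]
            sender = request.get_response_sender()
            for _ in range(k):
                out = pb_utils.Tensor("OUT", np.array([k], dtype=np.int32))
                sender.send(pb_utils.InferenceResponse(output_tensors=[out]))
            # Close the stream with an empty final response.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None
```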
+ +import gc +import os +import sys +import threading +import unittest +from multiprocessing import Pool + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + +_deferred_exceptions_lock = threading.Lock() +_deferred_exceptions = [] + + +def bls_add_sub(_=None): + input0_np = np.random.randn(*[16]) + input0_np = input0_np.astype(np.float32) + input1_np = np.random.randn(*[16]) + input1_np = input1_np.astype(np.float32) + input0 = pb_utils.Tensor("INPUT0", input0_np) + input1 = pb_utils.Tensor("INPUT1", input1_np) + infer_request = pb_utils.InferenceRequest( + model_name="add_sub", + inputs=[input0, input1], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + infer_response = infer_request.exec() + if infer_response.has_error(): + return False + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") + if output0 is None or output1 is None: + return False + + expected_output_0 = input0.as_numpy() + input1.as_numpy() + expected_output_1 = input0.as_numpy() - input1.as_numpy() + + if not np.all(expected_output_0 == output0.as_numpy()): + return False + + if not np.all(expected_output_1 == output1.as_numpy()): + return False + + return True + + +def bls_square(_=None): + input0_np = np.random.randint(16, size=1, dtype=np.int32) + input0 = pb_utils.Tensor("IN", input0_np) + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", inputs=[input0], requested_output_names=["OUT"] + ) + infer_responses = infer_request.exec(decoupled=True) + + response_count = 0 + + if infer_responses: + for infer_response in infer_responses: + if infer_response.has_error(): + return False + + if len(infer_response.output_tensors()) > 0: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + if output0 is None: + return False + + expected_output = input0.as_numpy() + + if not np.all(expected_output == output0.as_numpy()): + return False + + response_count += 1 + + if not np.all(input0.as_numpy() == response_count - 1): + return False + + return True + + +def bls_libtorch(model_name, result_device): + shape = [16] + input0_np = np.random.rand(*shape).astype(np.float32) + input1_np = np.random.rand(*shape).astype(np.float32) + input0 = pb_utils.Tensor("INPUT0", input0_np) + input1 = pb_utils.Tensor("INPUT1", input1_np) + + if result_device == "CPU": + preferred_memory = pb_utils.PreferredMemory(pb_utils.TRITONSERVER_MEMORY_CPU) + else: + preferred_memory = pb_utils.PreferredMemory(pb_utils.TRITONSERVER_MEMORY_GPU, 0) + + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + model_version=1, + inputs=[input0, input1], + requested_output_names=["OUTPUT__0", "OUTPUT__1"], + preferred_memory=preferred_memory, + ) + + infer_response = infer_request.exec() + if infer_response.has_error(): + return False + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT__0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT__1") + if output0 is None or output1 is None: + return False + + expected_output_0 = input0.as_numpy() + input1.as_numpy() + expected_output_1 = input0.as_numpy() - input1.as_numpy() + + if result_device == "CPU": + if not output0.is_cpu() or not output1.is_cpu(): + return False + + if not np.all(expected_output_0 == output0.as_numpy()): + return False + + if not np.all(expected_output_1 == output1.as_numpy()): + return False + else: + if 
output0.is_cpu() or output1.is_cpu(): + return False + output0 = from_dlpack(output0.to_dlpack()).to("cpu").cpu().detach().numpy() + output1 = from_dlpack(output1.to_dlpack()).to("cpu").cpu().detach().numpy() + + if not np.all(output0 == expected_output_0): + return False + if not np.all(output1 == expected_output_1): + return False + + return True + + +class PBBLSTest(unittest.TestCase): + def setUp(self): + self._is_decoupled = True if os.environ["BLS_KIND"] == "decoupled" else False + + def add_deferred_exception(self, ex): + global _deferred_exceptions + with _deferred_exceptions_lock: + _deferred_exceptions.append(ex) + + def check_deferred_exception(self): + with _deferred_exceptions_lock: + if len(_deferred_exceptions) > 0: + raise _deferred_exceptions[0] + + def test_bls_wrong_inputs(self): + input0 = pb_utils.Tensor("INPUT0", np.random.randn(*[1, 16])) + + if self._is_decoupled: + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", inputs=[], requested_output_names=["OUT"] + ) + infer_responses = infer_request.exec(decoupled=True) + for infer_response in infer_responses: + self.assertTrue(infer_response.has_error()) + self.assertIn( + "expected 1 inputs but got 0 inputs for model 'square_int32'. Got input(s) [], but missing required input(s) ['IN']. Please provide all required input(s).", + infer_response.error().message(), + ) + self.assertTrue(len(infer_response.output_tensors()) == 0) + else: + infer_request = pb_utils.InferenceRequest( + model_name="add_sub", + inputs=[input0], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + infer_response = infer_request.exec() + self.assertTrue(infer_response.has_error()) + self.assertIn( + "expected 2 inputs but got 1 inputs for model 'add_sub'", + infer_response.error().message(), + ) + self.assertTrue(len(infer_response.output_tensors()) == 0) + + def _send_bls_sequence_requests(self, correlation_id, is_decoupled): + # Start request + try: + input = pb_utils.Tensor("INPUT", np.array([1000], dtype=np.int32)) + + infer_request = pb_utils.InferenceRequest( + model_name="onnx_nobatch_sequence_int32", + inputs=[input], + requested_output_names=["OUTPUT"], + flags=pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START, + correlation_id=correlation_id, + ) + self.assertTrue( + infer_request.flags(), pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START + ) + infer_response = infer_request.exec() + self.assertFalse(infer_response.has_error()) + output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT") + self.assertFalse(output.is_cpu()) + output = from_dlpack(output.to_dlpack()).to("cpu").cpu().detach().numpy() + self.assertEqual(output[0], input.as_numpy()[0]) + + for i in range(10): + input = pb_utils.Tensor("INPUT", np.array([i], dtype=np.int32)) + infer_request = pb_utils.InferenceRequest( + model_name="onnx_nobatch_sequence_int32", + inputs=[input], + requested_output_names=["OUTPUT"], + correlation_id=correlation_id, + ) + + if is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + self.assertFalse(infer_response.has_error()) + + # The new output is the previous output + the current input + expected_output = output[0] + i + output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT") + self.assertFalse(output.is_cpu()) + output = ( + from_dlpack(output.to_dlpack()).to("cpu").cpu().detach().numpy() + ) + self.assertEqual(output[0], 
expected_output) + + # Final request + input = pb_utils.Tensor("INPUT", np.array([2000], dtype=np.int32)) + + infer_request = pb_utils.InferenceRequest( + model_name="onnx_nobatch_sequence_int32", + inputs=[input], + requested_output_names=["OUTPUT"], + correlation_id=correlation_id, + ) + infer_request.set_flags(pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END) + self.assertTrue( + infer_request.flags(), pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_END + ) + + if is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + + self.assertFalse(infer_response.has_error()) + expected_output = output[0] + input.as_numpy()[0] + output = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT") + self.assertFalse(output.is_cpu()) + output = from_dlpack(output.to_dlpack()).to("cpu").cpu().detach().numpy() + self.assertEqual(output[0], expected_output) + except Exception as e: + self.add_deferred_exception(e) + + def test_bls_sequence(self): + # Send 2 sequence of BLS requests simultaneously and check the responses. + threads = [] + thread1 = threading.Thread( + target=self._send_bls_sequence_requests, + args=( + 1000, + self._is_decoupled, + ), + ) + threads.append(thread1) + thread2 = threading.Thread( + target=self._send_bls_sequence_requests, + args=( + 1001, + self._is_decoupled, + ), + ) + threads.append(thread2) + + for thread in threads: + thread.start() + + for thread in threads: + thread.join() + + # Check if any of the threads had an exception + self.check_deferred_exception() + + def test_bls_incorrect_args(self): + with self.assertRaises(TypeError): + pb_utils.InferenceRequest( + inputs=[], requested_output_names=["OUTPUT0", "OUTPUT1"] + ) + + with self.assertRaises(TypeError): + pb_utils.InferenceRequest( + model_name="add_sub", requested_output_names=["OUTPUT0", "OUTPUT1"] + ) + + with self.assertRaises(TypeError): + pb_utils.InferenceRequest(model_name="add_sub", inputs=[]) + + def _get_gpu_bls_outputs(self, input0_pb, input1_pb, is_decoupled): + """ + This function is created to test that the DLPack container works + properly when the inference response and outputs go out of scope. + """ + infer_request = pb_utils.InferenceRequest( + model_name="dlpack_add_sub", + inputs=[input0_pb, input1_pb], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + if is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + + self.assertFalse(infer_response.has_error()) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") + self.assertIsNotNone(output0) + self.assertIsNotNone(output1) + + # When one of the inputs is in GPU the output returned by the model must + # be in GPU, otherwise the outputs will be in CPU. + if not input0_pb.is_cpu() or not input1_pb.is_cpu(): + self.assertTrue((not output0.is_cpu()) and (not output1.is_cpu())) + else: + self.assertTrue((output0.is_cpu()) and (output1.is_cpu())) + + # Make sure that the reference count is increased by one when DLPack + # representation is created. 
+ rc_before_dlpack_output0 = sys.getrefcount(output0) + rc_before_dlpack_output1 = sys.getrefcount(output1) + + output0_dlpack = output0.to_dlpack() + output1_dlpack = output1.to_dlpack() + + rc_after_dlpack_output0 = sys.getrefcount(output0) + rc_after_dlpack_output1 = sys.getrefcount(output1) + + self.assertEqual(rc_after_dlpack_output0 - rc_before_dlpack_output0, 1) + self.assertEqual(rc_after_dlpack_output1 - rc_before_dlpack_output1, 1) + + # Make sure that reference count decreases after destroying the DLPack + output0_dlpack = None + output1_dlpack = None + rc_after_del_dlpack_output0 = sys.getrefcount(output0) + rc_after_del_dlpack_output1 = sys.getrefcount(output1) + self.assertEqual(rc_after_del_dlpack_output0 - rc_after_dlpack_output0, -1) + self.assertEqual(rc_after_del_dlpack_output1 - rc_after_dlpack_output1, -1) + + return output0.to_dlpack(), output1.to_dlpack() + + def test_zero_length_io(self): + model_name = "identity_fp32" + input0 = np.zeros([1, 0], dtype=np.float32) + input0_pb = pb_utils.Tensor("INPUT0", input0) + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + inputs=[input0_pb], + requested_output_names=["OUTPUT0"], + ) + + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + + self.assertFalse(infer_response.has_error()) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + self.assertTrue(np.all(output0 == input0)) + + def cuda_memory_stats(self): + allocated_bytes = torch.cuda.memory_allocated() + reserved_bytes = torch.cuda.memory_reserved() + return allocated_bytes, reserved_bytes + + def bls_tensor_lifecycle_helper(self): + model_name = "dlpack_identity" + verbose = True + + # A 10 MB tensor. + input_size = 10 * 1024 * 1024 + input_type_size_bytes = 4 # TYPE_FP32 + input_size_bytes = input_size * input_type_size_bytes + + # Sending the tensor 50 times to test whether the deallocation is + # happening correctly. If the deallocation doesn't happen correctly, + # there will be an out of shared memory error. + for _ in range(50): + input0 = np.ones([1, input_size], dtype=np.float32) + input0_pb = pb_utils.Tensor("INPUT0", input0) + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + inputs=[input0_pb], + requested_output_names=["OUTPUT0"], + ) + + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + self.assertFalse(infer_response.has_error()) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + np.testing.assert_equal( + output0.as_numpy(), input0, "BLS CPU memory lifecycle failed." + ) + + # Show total memory stats before gpu tensor test + print(torch.cuda.memory_summary()) + + # Checking the same with the GPU tensors. 
+ for index in range(50): + input0 = None + infer_request = None + input0_pb = None + fail_msg = f"GPU memory lifecycle test failed at index: {index}" + + torch.cuda.empty_cache() + alloced, cached = self.cuda_memory_stats() + + # Check cuda memory usage is cleaned up (empty) between iterations + # when device tensors go out of scope + self.assertEqual(alloced, 0, fail_msg) + # Check that cache is properly cleaned up when emptied + self.assertEqual(cached, 0, fail_msg) + + if verbose: + # NOTE: this reflects total gpu memory usage, and may be affected + # by other processes, so don't use it for direct checks but log it + # for debugging/context. + free_memory, total_memory = torch.cuda.mem_get_info() + used_memory = total_memory - free_memory + print(f"[DEBUG][Iteration {index}][GPU] {used_memory=} bytes") + + input0 = torch.ones([1, input_size], dtype=torch.float32).to("cuda") + input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + # Check cuda memory usage after creating device tensor + alloced, _ = self.cuda_memory_stats() + self.assertEqual( + alloced, + input_size_bytes, + "Expected precise byte allocation after input tensor creation", + ) + + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + inputs=[input0_pb], + requested_output_names=["OUTPUT0"], + ) + + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + + self.assertFalse(infer_response.has_error()) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output0_pytorch = from_dlpack(output0.to_dlpack()) + + # Stats after getting output tensor + alloced, _ = self.cuda_memory_stats() + self.assertEqual( + alloced, + input_size_bytes, + "Expected only input allocation, as output zero-copies input tensor", + ) + + # Set inference response and output0_pytorch to None, to make sure + # that the DLPack is still valid. 
+ output0 = None + infer_response = None + self.assertTrue( + torch.all(output0_pytorch == input0), + f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model.", + ) + + print(torch.cuda.memory_summary()) + + def assert_cuda_memory_empty(self, msg): + torch.cuda.empty_cache() + alloced, cached = self.cuda_memory_stats() + self.assertEqual(alloced, 0, msg) + self.assertEqual(cached, 0, msg) + + def test_bls_tensor_lifecycle(self): + self.assert_cuda_memory_empty("Expected all gpu memory cleaned up before test") + self.bls_tensor_lifecycle_helper() + self.assert_cuda_memory_empty("Expected all gpu memory cleaned up after test") + + def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu, is_decoupled=False): + input0 = torch.rand(16) + input1 = torch.rand(16) + + if is_input0_gpu: + input0 = input0.to("cuda") + + if is_input1_gpu: + input1 = input1.to("cuda") + + input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + input1_pb = pb_utils.Tensor.from_dlpack("INPUT1", to_dlpack(input1)) + + output0_dlpack, output1_dlpack = self._get_gpu_bls_outputs( + input0_pb, input1_pb, is_decoupled=is_decoupled + ) + + expected_output_0 = from_dlpack(input0_pb.to_dlpack()).to("cpu") + from_dlpack( + input1_pb.to_dlpack() + ).to("cpu") + expected_output_1 = from_dlpack(input0_pb.to_dlpack()).to("cpu") - from_dlpack( + input1_pb.to_dlpack() + ).to("cpu") + + self.assertTrue( + torch.all(expected_output_0 == from_dlpack(output0_dlpack).to("cpu")) + ) + self.assertTrue( + torch.all(expected_output_1 == from_dlpack(output1_dlpack).to("cpu")) + ) + + def test_gpu_bls(self): + for input0_device in [True, False]: + for input1_device in [True, False]: + self._test_gpu_bls_add_sub( + input0_device, input1_device, self._is_decoupled + ) + + def test_multiprocess(self): + # Test multiprocess Pool with sync BLS + if self._is_decoupled: + # Fixme: DLIS-4630 + # func_name = bls_square + pass + else: + func_name = bls_add_sub + + pool = Pool(10) + pool.map(func_name, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + pool.close() + pool.join() + + def test_bls_sync(self): + infer_request = pb_utils.InferenceRequest( + model_name="non_existent_model", inputs=[], requested_output_names=[] + ) + + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + + for infer_response in infer_responses: + # Because the model doesn't exist, the inference response must have an + # error + self.assertTrue(infer_response.has_error()) + self.assertIn( + "Failed for execute the inference request. Model 'non_existent_model' is not ready.", + infer_response.error().message(), + ) + + # Make sure that the inference requests can be performed properly after + # an error. + self.assertTrue(bls_square()) + else: + infer_response = infer_request.exec() + + # Because the model doesn't exist, the inference response must have an + # error + self.assertTrue(infer_response.has_error()) + self.assertIn( + "Failed for execute the inference request. Model 'non_existent_model' is not ready.", + infer_response.error().message(), + ) + + # Make sure that the inference requests can be performed properly after + # an error. + self.assertTrue(bls_add_sub()) + + def test_bls_execute_error(self): + # Test BLS with a model that has an error during execution. 
+ infer_request = pb_utils.InferenceRequest( + model_name="execute_error", inputs=[], requested_output_names=[] + ) + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + + self.assertTrue(infer_response.has_error()) + self.assertIn( + "expected 1 inputs but got 0 inputs for model 'execute_error'", + infer_response.error().message(), + ) + self.assertTrue(len(infer_response.output_tensors()) == 0) + + def test_multiple_bls(self): + # Test running multiple BLS requests together + if self._is_decoupled: + for _ in range(100): + self.assertTrue(bls_square()) + else: + for _ in range(100): + self.assertTrue(bls_add_sub()) + + def test_timeout(self): + tensor_size = [1, 1024 * 1024] + input0_np = np.random.randn(*tensor_size) + input0 = pb_utils.Tensor("INPUT0", input0_np.astype(np.float32)) + infer_request = pb_utils.InferenceRequest( + model_name="identity_fp32_timeout", + inputs=[input0], + requested_output_names=["OUTPUT0"], + timeout=5, + ) + + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + else: + infer_response = infer_request.exec() + + # Expect timeout error + self.assertTrue(infer_response.has_error()) + self.assertIn("Request timeout expired", infer_response.error().message()) + self.assertTrue(len(infer_response.output_tensors()) == 0) + + # Verifies two things: + # 1. A request timeout can be accessed by receiver models + # 2. A user can specify a very large value (11s) for a timeout + infer_request = pb_utils.InferenceRequest( + model_name="identity_fp32_timeout", + inputs=[input0], + requested_output_names=["OUTPUT0"], + timeout=11000000000, + ) + + if self._is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + else: + infer_response = infer_request.exec() + + # Expect no timeout error. Check for log message + # in test.sh + self.assertFalse(infer_response.has_error()) + + def _test_response_iterator_square( + self, expected_output_cnt, expected_output_value, response_iterator + ): + response_count = 0 + expected_output_cnt = np.array([expected_output_cnt], dtype=np.int32) + + for infer_response in response_iterator: + self.assertFalse(infer_response.has_error()) + if len(infer_response.output_tensors()) > 0: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + self.assertIsNotNone(output0) + self.assertEqual(expected_output_value, output0.as_numpy()) + + response_count += 1 + + self.assertEqual(response_count, expected_output_cnt) + + # Make sure the iterator is exhausted. + with self.assertRaises(StopIteration): + next(response_iterator) + + return response_iterator + + def test_response_iterator(self): + if self._is_decoupled: + # Test the response iterator for decoupled responses. The request + # has 4 decoupled responses followed by an empty response. + response_value = 4 + input0_np = np.array([response_value], dtype=np.int32) + input0 = pb_utils.Tensor("IN", input0_np) + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", + inputs=[input0], + requested_output_names=["OUT"], + ) + infer_responses = infer_request.exec(decoupled=True) + + # case 1. Use Next() to get the next response first, then use + # for-loop to get the remaining responses. 
+ infer_response = next(infer_responses) + self.assertFalse(infer_response.has_error()) + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + self.assertIsNotNone(output0) + self.assertEqual(response_value, output0.as_numpy()) + # The iterator now should only have 4 remaining responses. + infer_responses = self._test_response_iterator_square( + 4, response_value, infer_responses + ) + + # case 2. Call for-loop to get all the responses multiple times. + infer_responses = self._test_response_iterator_square( + 5, response_value, infer_responses + ) + infer_responses = self._test_response_iterator_square( + 5, response_value, infer_responses + ) + infer_responses = self._test_response_iterator_square( + 5, response_value, infer_responses + ) + + # case 3. Break from the iteration, then use Next() and for-loop to + # get the remaining responses. + response_count = 0 + for infer_response in infer_responses: + self.assertFalse(infer_response.has_error()) + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + self.assertIsNotNone(output0) + self.assertEqual(response_value, output0.as_numpy()) + + response_count += 1 + if response_count == 2: + break + + infer_response = next(infer_responses) + self.assertFalse(infer_response.has_error()) + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + self.assertIsNotNone(output0) + self.assertEqual(response_value, output0.as_numpy()) + + # The iterator now should only have 2 remaining responses. + infer_responses = self._test_response_iterator_square( + 2, response_value, infer_responses + ) + + # case 4. Delete the iterator before all the responses have been + # retrieved. + infer_responses = infer_request.exec(decoupled=True) + + infer_response = next(infer_responses) + self.assertFalse(infer_response.has_error()) + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + self.assertIsNotNone(output0) + self.assertEqual(response_value, output0.as_numpy()) + + del infer_responses + + def test_preferred_memory(self): + self.assertTrue(bls_libtorch("libtorch_gpu", "CPU")) + self.assertTrue(bls_libtorch("libtorch_cpu", "GPU")) + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. + test = unittest.main("model", exit=False) + for test_case, traceback in test.result.failures: + print(f"{test_case} failed:\n{traceback}") + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/python_models/bls_async/config.pbtxt b/qa/python_models/bls_async/config.pbtxt new file mode 100644 index 0000000000..8e3c7ce2ba --- /dev/null +++ b/qa/python_models/bls_async/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_async" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_BOOL + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/bls_async/model.py b/qa/python_models/bls_async/model.py new file mode 100644 index 0000000000..8d75259b7b --- /dev/null +++ b/qa/python_models/bls_async/model.py @@ -0,0 +1,251 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
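The helpers in bls_async/model.py below (and the GPU paths in bls/model.py above) move tensors between pb_utils.Tensor and torch through DLPack rather than copying through numpy. A minimal sketch of that round trip (assuming torch with CUDA and the Python backend environment, since pb_utils only exists inside a running backend; names are illustrative):

```python
import torch
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import from_dlpack, to_dlpack

# GPU torch tensor -> zero-copy pb_utils.Tensor (as in create_addsub_inference_request below).
gpu_t = torch.rand(16).to("cuda")
pb_t = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(gpu_t))

# pb_utils.Tensor -> torch tensor, then to host memory for a numpy comparison.
host = from_dlpack(pb_t.to_dlpack()).to("cpu").detach().numpy()
assert (host == gpu_t.cpu().numpy()).all()
```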
+ +import asyncio +import os + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +def verify_add_sub_results(input0, input1, infer_response): + if infer_response.has_error(): + print("Async BLS failed:", infer_response.error().message(), flush=True) + return False + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") + + if (output0 is None) or (output1 is None): + return False + + if not input0.is_cpu(): + input0 = from_dlpack(input0.to_dlpack()).to("cpu").cpu().detach().numpy() + else: + input0 = input0.as_numpy() + + if not input1.is_cpu(): + input1 = from_dlpack(input1.to_dlpack()).to("cpu").cpu().detach().numpy() + else: + input1 = input1.as_numpy() + + if not output0.is_cpu(): + output0 = from_dlpack(output0.to_dlpack()).to("cpu").cpu().detach().numpy() + else: + output0 = output0.as_numpy() + + if not output1.is_cpu(): + output1 = from_dlpack(output1.to_dlpack()).to("cpu").cpu().detach().numpy() + else: + output1 = output1.as_numpy() + + expected_output_0 = input0 + input1 + expected_output_1 = input0 - input1 + + if not np.all(expected_output_0 == output0): + print(f"For OUTPUT0 expected {expected_output_0} found {output0}") + return False + + if not np.all(expected_output_1 == output1): + print(f"For OUTPUT1 expected {expected_output_1} found {output1}") + return False + + return True + + +def verify_square_results(input0, infer_responses): + if not input0.is_cpu(): + input0 = from_dlpack(input0.to_dlpack()).to("cpu").cpu().detach().numpy() + else: + input0 = input0.as_numpy() + + response_count = 0 + + for infer_response in infer_responses: + if infer_response.has_error(): + print( + "Async BLS decoupled failed:", + infer_response.error().message(), + flush=True, + ) + return False + + if len(infer_response.output_tensors()) > 0: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + + if output0 is None: + return False + + if not output0.is_cpu(): + output0 = ( + from_dlpack(output0.to_dlpack()).to("cpu").cpu().detach().numpy() + ) + else: + output0 = output0.as_numpy() + + expected_output = input0 + + if not np.all(expected_output == input0): + print(f"For OUT expected {expected_output} found {output0}") + return False + + response_count += 1 + + if not np.all(input0 == response_count - 1): + print("Expected {} responses, got {}".format(input0, response_count - 1)) + return False + + return True + + +def create_addsub_inference_request(gpu=False): + if not gpu: + input0_np = np.random.randn(16) + input1_np = np.random.randn(16) + input0_np = input0_np.astype(np.float32) + input1_np = input1_np.astype(np.float32) + input0 = pb_utils.Tensor("INPUT0", input0_np) + input1 = pb_utils.Tensor("INPUT1", input1_np) + else: + input0_pytorch = torch.rand(16).to("cuda") + input1_pytorch = torch.rand(16).to("cuda") + input0 = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0_pytorch)) + input1 = pb_utils.Tensor.from_dlpack("INPUT1", to_dlpack(input1_pytorch)) + + infer_request = pb_utils.InferenceRequest( + model_name="dlpack_add_sub", + inputs=[input0, input1], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + return input0, input1, infer_request + + +def create_square_inference_request(gpu=False): + if not gpu: + input0_np = np.random.randint(16, size=1, dtype=np.int32) + input0 = pb_utils.Tensor("IN", input0_np) + else: + input0_pytorch = torch.randint(1, 16, (1,), 
dtype=torch.int32).to("cuda") + input0 = pb_utils.Tensor.from_dlpack("IN", to_dlpack(input0_pytorch)) + + infer_request = pb_utils.InferenceRequest( + model_name="dlpack_square", inputs=[input0], requested_output_names=["OUT"] + ) + return input0, infer_request + + +async def async_bls_add_sub(): + input0, input1, infer_request = create_addsub_inference_request() + infer_response = await infer_request.async_exec() + result_correct = verify_add_sub_results(input0, input1, infer_response) + if not result_correct: + return False + + infer_response_sync = infer_request.exec() + result_correct = verify_add_sub_results(input0, input1, infer_response_sync) + if not result_correct: + return False + + return True + + +async def async_bls_square(): + input0, infer_request = create_square_inference_request() + infer_responses = await infer_request.async_exec(decoupled=True) + result_correct = verify_square_results(input0, infer_responses) + if not result_correct: + return False + + infer_responses_sync = infer_request.exec(decoupled=True) + result_correct = verify_square_results(input0, infer_responses_sync) + if not result_correct: + return False + + return True + + +async def multiple_async_bls_addsub(gpu): + infer_request_aws = [] + inputs = [] + for _ in range(10): + input0, input1, infer_request = create_addsub_inference_request(gpu) + inputs.append((input0, input1)) + infer_request_aws.append(infer_request.async_exec()) + + infer_responses = await asyncio.gather(*infer_request_aws) + for infer_response, input_pair in zip(infer_responses, inputs): + result_correct = verify_add_sub_results( + input_pair[0], input_pair[1], infer_response + ) + if not result_correct: + return False + + return True + + +async def multiple_async_bls_square(gpu): + infer_request_aws = [] + inputs = [] + for _ in range(10): + input0, infer_request = create_square_inference_request(gpu) + inputs.append(input0) + infer_request_aws.append(infer_request.async_exec(decoupled=True)) + + async_responses = await asyncio.gather(*infer_request_aws) + for infer_responses, input_pair in zip(async_responses, inputs): + result_correct = verify_square_results(input_pair, infer_responses) + if not result_correct: + return False + + return True + + +class TritonPythonModel: + async def execute(self, requests): + is_decoupled = True if os.environ["BLS_KIND"] == "decoupled" else False + + responses = [] + for _ in requests: + if is_decoupled: + test1 = await multiple_async_bls_square(gpu=True) + test2 = await multiple_async_bls_square(gpu=False) + test3 = await async_bls_square() + else: + test1 = await multiple_async_bls_addsub(gpu=True) + test2 = await multiple_async_bls_addsub(gpu=False) + test3 = await async_bls_add_sub() + + responses.append( + pb_utils.InferenceResponse( + output_tensors=[ + pb_utils.Tensor("OUTPUT0", np.array([test1 & test2 & test3])) + ] + ) + ) + + return responses diff --git a/qa/python_models/bls_finalize_error/config.pbtxt b/qa/python_models/bls_finalize_error/config.pbtxt new file mode 100644 index 0000000000..ff5f42188b --- /dev/null +++ b/qa/python_models/bls_finalize_error/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_finalize_error" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/bls_finalize_error/model.py b/qa/python_models/bls_finalize_error/model.py new file mode 100644 index 0000000000..a38b1080ad --- /dev/null +++ b/qa/python_models/bls_finalize_error/model.py @@ -0,0 +1,45 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
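+
+# Test model that deliberately issues a decoupled BLS request to the
+# "square_int32" model from finalize(), exercising error handling when BLS is
+# used during model cleanup.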
+ +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + pass + + def execute(self, requests): + pass + + def finalize(self): + print("Cleaning up...") + input0_np = np.random.randint(3, size=1, dtype=np.int32) + input0 = pb_utils.Tensor("IN", input0_np) + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", inputs=[input0], requested_output_names=["OUT"] + ) + infer_responses = infer_request.exec(decoupled=True) diff --git a/qa/python_models/bls_init_error/config.pbtxt b/qa/python_models/bls_init_error/config.pbtxt new file mode 100644 index 0000000000..6cf5024e1f --- /dev/null +++ b/qa/python_models/bls_init_error/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_init_error" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/bls_init_error/model.py b/qa/python_models/bls_init_error/model.py new file mode 100644 index 0000000000..b2518e0334 --- /dev/null +++ b/qa/python_models/bls_init_error/model.py @@ -0,0 +1,44 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + input0_np = np.random.randint(3, size=1, dtype=np.int32) + input0 = pb_utils.Tensor("IN", input0_np) + infer_request = pb_utils.InferenceRequest( + model_name="square_int32", inputs=[input0], requested_output_names=["OUT"] + ) + infer_responses = infer_request.exec(decoupled=True) + + def execute(self, requests): + pass + + def finalize(self): + print("Cleaning up...") diff --git a/qa/python_models/bls_memory/config.pbtxt b/qa/python_models/bls_memory/config.pbtxt new file mode 100644 index 0000000000..30c2169f6b --- /dev/null +++ b/qa/python_models/bls_memory/config.pbtxt @@ -0,0 +1,39 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_memory" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] + diff --git a/qa/python_models/bls_memory/model.py b/qa/python_models/bls_memory/model.py new file mode 100644 index 0000000000..69da4f440f --- /dev/null +++ b/qa/python_models/bls_memory/model.py @@ -0,0 +1,103 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import unittest + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class PBBLSMemoryTest(unittest.TestCase): + def setUp(self): + self._is_decoupled = True if os.environ["BLS_KIND"] == "decoupled" else False + + def _send_identity_tensor(self, size, is_decoupled): + tensor_size = [1, size] + input0_np = np.random.randn(*tensor_size) + input0 = pb_utils.Tensor("INPUT0", input0_np.astype(np.float32)) + infer_request = pb_utils.InferenceRequest( + model_name="identity_fp32", + inputs=[input0], + requested_output_names=["OUTPUT0"], + ) + + if is_decoupled: + infer_responses = infer_request.exec(decoupled=True) + infer_response = next(infer_responses) + with self.assertRaises(StopIteration): + next(infer_responses) + else: + infer_response = infer_request.exec() + + return input0_np, infer_response + + def test_bls_out_of_memory(self): + tensor_size = 256 * 1024 * 1024 + input0_np, infer_response = self._send_identity_tensor( + tensor_size, self._is_decoupled + ) + out_of_memory_message = "Failed to increase the shared memory pool size for key" + + if infer_response.has_error(): + self.assertIn(out_of_memory_message, infer_response.error().message()) + else: + self.assertFalse(infer_response.has_error()) + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + self.assertIsNotNone(output0) + self.assertTrue(np.allclose(output0.as_numpy(), input0_np)) + + tensor_size = 50 * 1024 * 1024 + for _ in range(4): + input0_np, infer_response = self._send_identity_tensor( + tensor_size, self._is_decoupled + ) + if infer_response.has_error(): + self.assertIn(out_of_memory_message, infer_response.error().message()) + else: + self.assertFalse(infer_response.has_error()) + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + self.assertIsNotNone(output0) + self.assertTrue(np.allclose(output0.as_numpy(), input0_np)) + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. 
+ test = unittest.main("model", exit=False) + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/python_models/bls_memory_async/config.pbtxt b/qa/python_models/bls_memory_async/config.pbtxt new file mode 100644 index 0000000000..66bfcd3bbf --- /dev/null +++ b/qa/python_models/bls_memory_async/config.pbtxt @@ -0,0 +1,39 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_memory_async" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] + diff --git a/qa/python_models/bls_memory_async/model.py b/qa/python_models/bls_memory_async/model.py new file mode 100644 index 0000000000..d9e676b42e --- /dev/null +++ b/qa/python_models/bls_memory_async/model.py @@ -0,0 +1,98 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +import numpy as np +import triton_python_backend_utils as pb_utils + + +async def _send_identity_tensor(size, is_decoupled): + tensor_size = [1, size] + input0_np = np.random.randn(*tensor_size) + input0 = pb_utils.Tensor("INPUT0", input0_np.astype(np.float32)) + infer_request = pb_utils.InferenceRequest( + model_name="identity_fp32", inputs=[input0], requested_output_names=["OUTPUT0"] + ) + + if is_decoupled: + infer_responses = await infer_request.async_exec(decoupled=True) + infer_response = next(infer_responses) + else: + infer_response = await infer_request.async_exec() + + return input0_np, infer_response + + +async def test_bls_out_of_memory(): + is_decoupled = True if os.environ["BLS_KIND"] == "decoupled" else False + + tensor_size = 256 * 1024 * 1024 + input0_np, infer_response = await _send_identity_tensor(tensor_size, is_decoupled) + + out_of_memory_message = "Failed to increase the shared memory pool size for key" + + if infer_response.has_error(): + if not (out_of_memory_message in infer_response.error().message()): + return False + else: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + if output0 is None: + return False + if not np.allclose(output0.as_numpy(), input0_np): + return False + + tensor_size = 50 * 1024 * 1024 + for _ in range(4): + input0_np, infer_response = await _send_identity_tensor( + tensor_size, is_decoupled + ) + + if infer_response.has_error(): + if not (out_of_memory_message in infer_response.error().message()): + return False + else: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + if output0 is None: + return False + if not np.allclose(output0.as_numpy(), input0_np): + return False + + return True + + +class TritonPythonModel: + async def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. + result = await test_bls_out_of_memory() + responses.append( + pb_utils.InferenceResponse( + [pb_utils.Tensor("OUTPUT0", np.array([result], dtype=np.float16))] + ) + ) + return responses diff --git a/qa/python_models/bls_model_loading/config.pbtxt b/qa/python_models/bls_model_loading/config.pbtxt new file mode 100644 index 0000000000..2099ba5db7 --- /dev/null +++ b/qa/python_models/bls_model_loading/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_model_loading" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_BOOL + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/qa/python_models/bls_model_loading/model.py b/qa/python_models/bls_model_loading/model.py new file mode 100644 index 0000000000..84162e2fac --- /dev/null +++ b/qa/python_models/bls_model_loading/model.py @@ -0,0 +1,135 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time +import unittest + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class PBBLSModelLoadingTest(unittest.TestCase): + def setUp(self): + self.model_name = "onnx_int32_int32_int32" + + def tearDown(self): + # The unload call does not wait for the requested model to be fully + # unloaded before returning. 
+ pb_utils.unload_model(self.model_name) + # TODO: Make this more robust to wait until fully unloaded + print("Sleep 30 seconds to make sure model finishes unloading...") + time.sleep(30) + print("Done sleeping.") + + def test_load_unload_model(self): + self.assertFalse(pb_utils.is_model_ready(model_name=self.model_name)) + pb_utils.load_model(model_name=self.model_name) + self.assertTrue(pb_utils.is_model_ready(self.model_name)) + pb_utils.unload_model(self.model_name) + self.assertFalse(pb_utils.is_model_ready(self.model_name)) + + def test_load_with_config_override(self): + self.assertFalse(pb_utils.is_model_ready(self.model_name)) + pb_utils.load_model(self.model_name) + self.assertTrue(pb_utils.is_model_ready(self.model_name)) + + # Send the config with the wrong format + wrong_config = '"parameters": {"config": {{"backend":"onnxruntime", "version_policy":{"specific":{"versions":[2]}}}}}' + with self.assertRaises(pb_utils.TritonModelException): + pb_utils.load_model(model_name=self.model_name, config=wrong_config) + # The model should not be changed after a failed load model request + for version in ["2", "3"]: + self.assertTrue( + pb_utils.is_model_ready( + model_name=self.model_name, model_version=version + ) + ) + + # Send the config with the correct format + config = ( + '{"backend":"onnxruntime", "version_policy":{"specific":{"versions":[2]}}}' + ) + pb_utils.load_model(self.model_name, config=config) + # The model should be changed after a successful load model request + self.assertTrue(pb_utils.is_model_ready(self.model_name, "2")) + self.assertFalse(pb_utils.is_model_ready(self.model_name, "3")) + + def test_load_with_file_override(self): + self.assertFalse(pb_utils.is_model_ready(self.model_name)) + pb_utils.load_model(self.model_name) + self.assertTrue(pb_utils.is_model_ready(self.model_name)) + + override_name = "override_model" + config = '{"backend":"onnxruntime"}' + with open("models/onnx_int32_int32_int32/3/model.onnx", "rb") as file: + data = file.read() + files = {"file:1/model.onnx": data} + + # Request to load the model with override file, should fail without + # providing override config. 
+ with self.assertRaises(pb_utils.TritonModelException): + pb_utils.load_model(self.model_name, "", files) + + # Request to load the model with override file and config in a different name + pb_utils.load_model(model_name=override_name, config=config, files=files) + # Sanity check that the model with original name is unchanged + self.assertFalse(pb_utils.is_model_ready(self.model_name, "1")) + self.assertTrue(pb_utils.is_model_ready(self.model_name, "3")) + + # Check the override model readiness + self.assertTrue(pb_utils.is_model_ready(override_name, "1")) + self.assertFalse(pb_utils.is_model_ready(override_name, "3")) + + # Request to load the model with override file and config in original name + pb_utils.load_model(self.model_name, config, files) + # Check that the model with original name is changed + self.assertTrue(pb_utils.is_model_ready(self.model_name, "1")) + self.assertFalse(pb_utils.is_model_ready(self.model_name, "3")) + + # Sanity check readiness of the different named model + self.assertTrue(pb_utils.is_model_ready(override_name, "1")) + self.assertFalse(pb_utils.is_model_ready(override_name, "3")) + + +class TritonPythonModel: + def initialize(self, args): + # Run the unittest during initialization + test = unittest.main("model", exit=False) + self.result = test.result.wasSuccessful() + + def execute(self, requests): + responses = [] + for _ in requests: + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", np.array([self.result], dtype=np.float16) + ) + ] + ) + ) + return responses diff --git a/qa/python_models/bls_onnx_warmup/config.pbtxt b/qa/python_models/bls_onnx_warmup/config.pbtxt new file mode 100644 index 0000000000..879f85ca81 --- /dev/null +++ b/qa/python_models/bls_onnx_warmup/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
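+
+# Minimal config for the bls_onnx_warmup wrapper; the checks live in model.py,
+# which reports the unittest result through OUTPUT0.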
+ +name: "bls_onnx_warmup" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] \ No newline at end of file diff --git a/qa/python_models/bls_onnx_warmup/model.py b/qa/python_models/bls_onnx_warmup/model.py new file mode 100644 index 0000000000..233bdc85ab --- /dev/null +++ b/qa/python_models/bls_onnx_warmup/model.py @@ -0,0 +1,88 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
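+
+# Unittest wrapper for the ONNX warmup BLS test: it runs a BLS request against
+# "onnx_nobatch_float32_float32_float32" and verifies that the returned
+# OUTPUT0/OUTPUT1 tensors live in GPU memory and match the expected
+# difference/sum of the inputs.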
+ +import unittest + +import numpy as np +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack + + +class PBBLSONNXWarmupTest(unittest.TestCase): + def test_onnx_output_mem_type(self): + input0_np = np.random.randn(*[16]) + input0_np = input0_np.astype(np.float32) + input1_np = np.random.randn(*[16]) + input1_np = input1_np.astype(np.float32) + input0 = pb_utils.Tensor("INPUT0", input0_np) + input1 = pb_utils.Tensor("INPUT1", input1_np) + infer_request = pb_utils.InferenceRequest( + model_name="onnx_nobatch_float32_float32_float32", + inputs=[input0, input1], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + + infer_response = infer_request.exec() + + self.assertFalse(infer_response.has_error()) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") + + self.assertIsNotNone(output0) + self.assertIsNotNone(output1) + + # The memory type of output tensor should be GPU + self.assertFalse(output0.is_cpu()) + self.assertFalse(output1.is_cpu()) + + expected_output_0 = input0.as_numpy() - input1.as_numpy() + expected_output_1 = input0.as_numpy() + input1.as_numpy() + + output0 = from_dlpack(output0.to_dlpack()).to("cpu").cpu().detach().numpy() + output1 = from_dlpack(output1.to_dlpack()).to("cpu").cpu().detach().numpy() + + self.assertTrue(np.all(output0 == expected_output_0)) + self.assertTrue(np.all(output1 == expected_output_1)) + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. + test = unittest.main("model", exit=False) + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/python_models/bls_parameters/config.pbtxt b/qa/python_models/bls_parameters/config.pbtxt new file mode 100644 index 0000000000..dddf300185 --- /dev/null +++ b/qa/python_models/bls_parameters/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_parameters" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "NUMBER_PARAMETERS" + data_type: TYPE_UINT8 + dims: [ 1 ] + } +] + +output [ + { + name: "PARAMETERS_AGGREGATED" + data_type: TYPE_STRING + dims: [ 1 ] + } +] + +instance_group [ + { + count: 4 + kind: KIND_CPU + } +] diff --git a/qa/python_models/bls_parameters/model.py b/qa/python_models/bls_parameters/model.py new file mode 100644 index 0000000000..5dc54ebffd --- /dev/null +++ b/qa/python_models/bls_parameters/model.py @@ -0,0 +1,77 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
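+
+# Test model that aggregates request parameters recursively via BLS: each call
+# adds bool_N/int_N/str_N entries for the current step and re-invokes
+# "bls_parameters" with NUMBER_PARAMETERS reduced by one, until the base case
+# (0) returns the accumulated parameters as a JSON string in
+# PARAMETERS_AGGREGATED.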
+ +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + + for request in requests: + num_params = int( + pb_utils.get_input_tensor_by_name( + request, "NUMBER_PARAMETERS" + ).as_numpy()[0] + ) + params = json.loads(request.parameters()) + + if num_params == 0: + # Base case where the received parameters are returned as JSON + response = json.dumps(params) + response_tensors = [ + pb_utils.Tensor( + "PARAMETERS_AGGREGATED", np.array([response], dtype=np.object_) + ) + ] + else: + # Add the parameters of num_params step to the received parameters + params["bool_" + str(num_params)] = bool(num_params) + params["int_" + str(num_params)] = num_params + params["str_" + str(num_params)] = str(num_params) + # Complete any remaining steps [1, num_params - 1] by calling self + # recursively via BLS + bls_request_tensor = pb_utils.Tensor( + "NUMBER_PARAMETERS", np.array([num_params - 1], dtype=np.ubyte) + ) + bls_request = pb_utils.InferenceRequest( + model_name="bls_parameters", + inputs=[bls_request_tensor], + requested_output_names=["PARAMETERS_AGGREGATED"], + parameters=params, + ) + bls_response = bls_request.exec() + response_tensors = bls_response.output_tensors() + + inference_response = pb_utils.InferenceResponse( + output_tensors=response_tensors + ) + responses.append(inference_response) + + return responses diff --git a/qa/python_models/bls_request_rescheduling/config.pbtxt b/qa/python_models/bls_request_rescheduling/config.pbtxt new file mode 100644 index 0000000000..84f8658f7f --- /dev/null +++ b/qa/python_models/bls_request_rescheduling/config.pbtxt @@ -0,0 +1,38 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "bls_request_rescheduling" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/bls_request_rescheduling/model.py b/qa/python_models/bls_request_rescheduling/model.py new file mode 100644 index 0000000000..8615622af9 --- /dev/null +++ b/qa/python_models/bls_request_rescheduling/model.py @@ -0,0 +1,133 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
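+
+# Unittest wrapper for BLS with request rescheduling. It checks the error
+# returned by the "wrong_return_type" model and runs end-to-end rescheduling
+# against the non-decoupled "request_rescheduling_addsub" and decoupled
+# "iterative_sequence" models, reloading the target model before each
+# end-to-end check.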
+ +import time +import unittest + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class RequestReschedulingTest(unittest.TestCase): + def _reload_model(self, model_name): + # Reload the model to reset the flag for multiple iterations + pb_utils.unload_model(model_name) + # TODO: Make this more robust to wait until fully unloaded + print("Sleep 10 seconds to make sure model finishes unloading...", flush=True) + time.sleep(10) + print("Done sleeping.", flush=True) + pb_utils.load_model(model_name) + + def test_wrong_return_type(self): + input0 = pb_utils.Tensor("INPUT0", (np.random.randn(*[4])).astype(np.float32)) + infer_request = pb_utils.InferenceRequest( + model_name="wrong_return_type", + inputs=[input0], + requested_output_names=["OUTPUT0"], + ) + + infer_response = infer_request.exec() + self.assertTrue(infer_response.has_error()) + self.assertIn( + "Expected a None object in the execute function return list for reschduled request", + infer_response.error().message(), + ) + + def test_non_decoupled_e2e(self): + model_name = "request_rescheduling_addsub" + self._reload_model(model_name) + + input0_np = np.random.randn(*[16]) + input0_np = input0_np.astype(np.float32) + input1_np = np.random.randn(*[16]) + input1_np = input1_np.astype(np.float32) + input0 = pb_utils.Tensor("INPUT0", input0_np) + input1 = pb_utils.Tensor("INPUT1", input1_np) + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + inputs=[input0, input1], + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) + infer_response = infer_request.exec() + + self.assertFalse(infer_response.has_error()) + + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") + + self.assertIsNotNone(output0) + self.assertIsNotNone(output1) + + expected_output_0 = input0.as_numpy() + input1.as_numpy() + expected_output_1 = input0.as_numpy() - input1.as_numpy() + + self.assertEqual(expected_output_0[0], output0.as_numpy()[0]) + self.assertEqual(expected_output_1[0], output1.as_numpy()[0]) + + def test_decoupled_e2e(self): + model_name = "iterative_sequence" + self._reload_model(model_name) + + input_value = 3 + input0 = pb_utils.Tensor("IN", np.array([input_value], dtype=np.int32)) + infer_request = pb_utils.InferenceRequest( + model_name=model_name, + inputs=[input0], + requested_output_names=["OUT"], + ) + infer_responses = infer_request.exec(decoupled=True) + + expected_output = input_value - 1 + + if infer_responses: + for infer_response in infer_responses: + self.assertFalse(infer_response.has_error()) + + if len(infer_response.output_tensors()) > 0: + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") + self.assertIsNotNone(output0) + + self.assertEqual(expected_output, output0.as_numpy()[0]) + expected_output -= 1 + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. + test = unittest.main("model", exit=False) + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/python_models/bls_simple/bls_simple.py b/qa/python_models/bls_simple/bls_simple.py new file mode 100644 index 0000000000..962c3834b9 --- /dev/null +++ b/qa/python_models/bls_simple/bls_simple.py @@ -0,0 +1,84 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + inputs = [ + {"name": "MODEL_NAME", "data_type": "TYPE_STRING", "dims": [1]}, + {"name": "INPUT0", "data_type": "TYPE_INT32", "dims": [1, 16]}, + {"name": "INPUT1", "data_type": "TYPE_INT32", "dims": [1, 16]}, + ] + outputs = [ + {"name": "OUTPUT0", "data_type": "TYPE_INT32", "dims": [16]}, + {"name": "OUTPUT1", "data_type": "TYPE_INT32", "dims": [16]}, + ] + + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) + + for input in inputs: + if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + for output in outputs: + if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + auto_complete_model_config.set_max_batch_size(0) + + return auto_complete_model_config + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + model_name = pb_utils.get_input_tensor_by_name(request, "MODEL_NAME") + model_name_string = model_name.as_numpy()[0] + + infer_request = pb_utils.InferenceRequest( + model_name=model_name_string, + requested_output_names=["OUTPUT0", "OUTPUT1"], + inputs=[in_0, in_1], + trace=request.trace(), + ) + + infer_response = infer_request.exec() + + inference_response = pb_utils.InferenceResponse( + output_tensors=infer_response.output_tensors() + ) + responses.append(inference_response) + + return responses diff --git a/qa/python_models/bls_undefined/config.pbtxt b/qa/python_models/bls_undefined/config.pbtxt new file mode 100644 index 0000000000..ab873d8a64 --- /dev/null +++ b/qa/python_models/bls_undefined/config.pbtxt @@ -0,0 +1,50 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "bls_undefined" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] + +instance_group [{ + kind: KIND_CPU, + count: 2 +}] + diff --git a/qa/python_models/bls_undefined/model.py b/qa/python_models/bls_undefined/model.py new file mode 100644 index 0000000000..30e5f4106a --- /dev/null +++ b/qa/python_models/bls_undefined/model.py @@ -0,0 +1,33 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
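+
+# Test model whose execute() intentionally references an undefined variable so
+# that the backend's handling of a failing execute() can be exercised.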
+ + +class TritonPythonModel: + def execute(self, requests): + undefined_variable + + def finalize(self): + print("Cleaning up...") diff --git a/qa/python_models/busy_op/config.pbtxt b/qa/python_models/busy_op/config.pbtxt new file mode 100644 index 0000000000..27f9003ab7 --- /dev/null +++ b/qa/python_models/busy_op/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "busy_op" +backend: "python" +max_batch_size: 1 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/busy_op/model.py b/qa/python_models/busy_op/model.py new file mode 100644 index 0000000000..a68343881b --- /dev/null +++ b/qa/python_models/busy_op/model.py @@ -0,0 +1,49 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """ + This model calls sleep for the first request in order to force requests to + sit in the queue, and result in memory growth. + """ + + def initialize(self, args): + self.sleep = True + + def execute(self, requests): + if self.sleep: + time.sleep(50) + self.sleep = False + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/cuda_memory_consumer/1/model.py b/qa/python_models/cuda_memory_consumer/1/model.py new file mode 100644 index 0000000000..e3526920ea --- /dev/null +++ b/qa/python_models/cuda_memory_consumer/1/model.py @@ -0,0 +1,69 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
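The `busy_op` model above stalls its first request with a 50-second sleep so that later requests back up in the scheduler queue. Driving that behavior requires a client that keeps several requests in flight at once; a minimal sketch, assuming the `tritonclient` package and a server on localhost:8000 (neither is part of this diff):

```python
import numpy as np
import tritonclient.http as httpclient

# concurrency > 1 lets async_infer keep several HTTP requests in flight; the
# network timeout is raised above the model's 50 s sleep so queued requests
# do not time out on the client side.
client = httpclient.InferenceServerClient(
    url="localhost:8000", concurrency=8, network_timeout=120.0
)

inp = httpclient.InferInput("INPUT0", [1, 4], "FP32")
inp.set_data_from_numpy(np.ones((1, 4), dtype=np.float32))

# The first request hits the 50 s sleep; the rest sit in the queue behind it.
handles = [client.async_infer("busy_op", inputs=[inp]) for _ in range(8)]
results = [h.get_result() for h in handles]
print(len(results), "responses received")
```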
+ +import triton_python_backend_utils as pb_utils +from cuda import cuda + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input = {"name": "INPUT", "data_type": "TYPE_FP32", "dims": [1]} + output = {"name": "OUTPUT", "data_type": "TYPE_FP32", "dims": [1]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input) + auto_complete_model_config.add_output(output) + + return auto_complete_model_config + + def initialize(self, args): + self.mem_ptr = None + # Initialize CUDA context + cuda.cuInit(0) + cuda.cuCtxCreate(0, 0) + + mem_info = cuda.cuMemGetInfo() + if mem_info[0] != 0: + raise pb_utils.TritonModelException("Failed to get CUDA memory info") + + mem_alloc = cuda.cuMemAlloc(mem_info[2] * 0.4) + if mem_alloc[0] != 0: + raise pb_utils.TritonModelException("Failed to allocate CUDA memory") + self.mem_ptr = mem_alloc[1] + + def finalize(self): + if self.mem_ptr is not None: + cuda.cuMemFree(self.mem_ptr) + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/cuda_memory_consumer/config.pbtxt b/qa/python_models/cuda_memory_consumer/config.pbtxt new file mode 100644 index 0000000000..b1e0348433 --- /dev/null +++ b/qa/python_models/cuda_memory_consumer/config.pbtxt @@ -0,0 +1,28 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +instance_group [{ kind: KIND_GPU, gpus: [0] }] diff --git a/qa/python_models/custom_metrics/config.pbtxt b/qa/python_models/custom_metrics/config.pbtxt new file mode 100644 index 0000000000..c2bf81331b --- /dev/null +++ b/qa/python_models/custom_metrics/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "custom_metrics" +backend: "python" + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [ + { + count: 3 + kind: KIND_CPU + } +] diff --git a/qa/python_models/custom_metrics/model.py b/qa/python_models/custom_metrics/model.py new file mode 100644 index 0000000000..7c78b46894 --- /dev/null +++ b/qa/python_models/custom_metrics/model.py @@ -0,0 +1,416 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
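The `custom_metrics` test model that follows verifies its counters, gauges, and histograms by scraping Triton's Prometheus endpoint (localhost:8002 by default) and asserting on the exposition text. The same endpoint can be parsed outside the model; a rough sketch using the `requests` package the test already imports:

```python
import re

import requests

# Default Triton metrics endpoint; adjust host/port if the server was started differently.
METRICS_URL = "http://localhost:8002/metrics"


def metric_value(name, labels=""):
    """Return the first reported value for a `name{labels} value` line, or None."""
    text = requests.get(METRICS_URL, timeout=5).text
    label_part = r"\{" + re.escape(labels) + r"\}" if labels else r"(?:\{[^}]*\})?"
    match = re.search(re.escape(name) + label_part + r"\s+([0-9.eE+-]+)", text)
    return float(match.group(1)) if match else None


# Example: the counter created by the test below, once the model has run its unittest.
print(
    metric_value(
        "test_counter_e2e",
        'example1="counter_label1",example2="counter_label2"',
    )
)
```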
+ +import unittest + +import numpy as np +import requests +import triton_python_backend_utils as pb_utils + + +class PBCustomMetricsTest(unittest.TestCase): + def _get_metrics(self): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def _metric_api_helper(self, metric, kind): + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. + logger = pb_utils.Logger + + # The value should be 0.0 before the test + self.assertEqual(metric.value(), 0.0) + + # Test increment positive value + increment = 2023.0 + metric.increment(increment) + self.assertEqual(metric.value(), increment) + logger.log_info("Incremented metric to : {}".format(metric.value())) + + # Test increment negative value + decrement = -23.5 + if kind == "counter": + # Counter should not accept negative values + with self.assertRaises(pb_utils.TritonModelException): + metric.increment(decrement) + else: + metric.increment(decrement) + self.assertEqual(metric.value(), increment + decrement) + logger.log_info("Decremented metric to : {}".format(metric.value())) + + # Test set value + value = 999.9 + if kind == "counter": + # Counter does not support set + with self.assertRaises(pb_utils.TritonModelException): + metric.set(value) + else: + metric.set(value) + self.assertEqual(metric.value(), value) + logger.log_info("Set metric to : {}".format(metric.value())) + + # Test observe value + observe = 0.05 + # Counter and gauge do not support observe + with self.assertRaises(pb_utils.TritonModelException): + metric.observe(observe) + + def _histogram_api_helper(self, metric, name, labels): + def histogram_str_builder(name, type, labels, value, le=None): + if type == "count" or type == "sum": + return f"{name}_{type}{{{labels}}} {value}" + elif type == "bucket": + return f'{name}_bucket{{{labels},le="{le}"}} {value}' + else: + raise + + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. 
+ logger = pb_utils.Logger + + # All values should be 0.0 before the test + metrics = self._get_metrics() + self.assertIn(histogram_str_builder(name, "count", labels, "0"), metrics) + self.assertIn(histogram_str_builder(name, "sum", labels, "0"), metrics) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "0", le="+Inf"), metrics + ) + + # Histogram does not support value + with self.assertRaises(pb_utils.TritonModelException): + metric.value() + + # Test increment value + increment = 2023.0 + # Histogram does not support increment + with self.assertRaises(pb_utils.TritonModelException): + metric.increment(increment) + + # Test set value + value = 999.9 + # Histogram does not support set + with self.assertRaises(pb_utils.TritonModelException): + metric.set(value) + + # Test observe value + data = [0.05, 1.5, 6.0] + for datum in data: + metric.observe(datum) + logger.log_info("Observe histogram metric with value : {}".format(datum)) + + metrics = self._get_metrics() + self.assertIn( + histogram_str_builder(name, "count", labels, str(len(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "sum", labels, str(sum(data))), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="0.1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "1", le="1"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="2.5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "2", le="5"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="10"), metrics + ) + self.assertIn( + histogram_str_builder(name, "bucket", labels, "3", le="+Inf"), metrics + ) + + def _dup_metric_helper(self, labels={}): + # Adding logger to test if custom metrics and logging work together + # as they use the same message queue. 
+ logger = pb_utils.Logger + + description = "dup metric" + metric_family = pb_utils.MetricFamily( + name="test_dup_metric", + description=description, + kind=pb_utils.MetricFamily.COUNTER, + ) + + # Verify dupe metrics reference same underlying metric + metric1 = metric_family.Metric(labels=labels) + metric2 = metric_family.Metric(labels=labels) + + # The value should be 0 before the test + self.assertEqual(metric1.value(), 0.0) + self.assertEqual(metric2.value(), 0.0) + + # Increment metric 1, check metric 2 == metric 1 + increment = 7.5 + metric1.increment(increment) + self.assertEqual(metric1.value(), metric2.value()) + logger.log_info("Incremented metric1 to : {}".format(metric1.value())) + logger.log_info("Incremented metric2 to : {}".format(metric2.value())) + + # Assert custom metric/family remains when there's still a reference to it + del metric1 + metrics = self._get_metrics() + self.assertIn(description, metrics) + + def test_counter_e2e(self): + metric_family = pb_utils.MetricFamily( + name="test_counter_e2e", + description="test metric counter kind end to end", + kind=pb_utils.MetricFamily.COUNTER, + ) + labels = {"example1": "counter_label1", "example2": "counter_label2"} + metric = metric_family.Metric(labels=labels) + self._metric_api_helper(metric, "counter") + + pattern = ( + 'test_counter_e2e{example1="counter_label1",example2="counter_label2"}' + ) + metrics = self._get_metrics() + self.assertIn(pattern, metrics) + + def test_gauge_e2e(self): + metric_family = pb_utils.MetricFamily( + name="test_gauge_e2e", + description="test metric gauge kind end to end", + kind=pb_utils.MetricFamily.GAUGE, + ) + labels = {"example1": "gauge_label1", "example2": "gauge_label2"} + metric = metric_family.Metric(labels=labels) + self._metric_api_helper(metric, "gauge") + + pattern = 'test_gauge_e2e{example1="gauge_label1",example2="gauge_label2"}' + metrics = self._get_metrics() + self.assertIn(pattern, metrics) + + def test_histogram_e2e(self): + name = "test_histogram_e2e" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram kind end to end", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + labels = {"example1": "histogram_label1", "example2": "histogram_label2"} + buckets = [0.1, 1.0, 2.5, 5.0, 10.0] + metric = metric_family.Metric(labels=labels, buckets=buckets) + + labels_str = 'example1="histogram_label1",example2="histogram_label2"' + self._histogram_api_helper(metric, name, labels_str) + + metrics = self._get_metrics() + count_pattern = f"{name}_count{{{labels_str}}}" + sum_pattern = f"{name}_sum{{{labels_str}}}" + bucket_pattern = f"{name}_bucket{{{labels_str}" + self.assertEqual(metrics.count(count_pattern), 1) + self.assertEqual(metrics.count(sum_pattern), 1) + self.assertEqual(metrics.count(bucket_pattern), len(buckets) + 1) + + def test_histogram_args(self): + name = "test_histogram_args" + metric_family = pb_utils.MetricFamily( + name=name, + description="test metric histogram args", + kind=pb_utils.MetricFamily.HISTOGRAM, + ) + + # Test "None" value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}) + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=None) + + # Test non-ascending order buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, buckets=[2.5, 0.1, 1.0, 10.0, 5.0]) + + # Test duplicate value buckets + with self.assertRaises(pb_utils.TritonModelException): + metric_family.Metric(labels={}, 
buckets=[1, 1, 2, 5, 5]) + + # Test empty list bucket + metric_family.Metric(labels={}, buckets=[]) + + def test_dup_metric_family_diff_kind(self): + # Test that a duplicate metric family can't be added with a conflicting type/kind + metric_family1 = pb_utils.MetricFamily( + name="test_dup_metric_family_diff_kind", + description="test metric family with same name but different kind", + kind=pb_utils.MetricFamily.COUNTER, + ) + with self.assertRaises(pb_utils.TritonModelException): + metric_family2 = pb_utils.MetricFamily( + name="test_dup_metric_family_diff_kind", + description="test metric family with same name but different kind", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.assertIsNone(metric_family2) + + self.assertIsNotNone(metric_family1) + + def test_dup_metric_family_diff_description(self): + # Test that a duplicate metric family name will still return the + # original metric family even if the description is changed + metric_family1 = pb_utils.MetricFamily( + name="test_dup_metric_family_diff_description", + description="first description", + kind=pb_utils.MetricFamily.COUNTER, + ) + metric_family2 = pb_utils.MetricFamily( + name="test_dup_metric_family_diff_description", + description="second description", + kind=pb_utils.MetricFamily.COUNTER, + ) + + metric2 = metric_family2.Metric() + self.assertEqual(metric2.value(), 0) + + # Delete metric_family1 and check if metric_family2 still references it + del metric_family1 + pattern = "test_dup_metric_family_diff_description first description" + metrics = self._get_metrics() + self.assertIn(pattern, metrics) + + # The first description will be kept if adding a duplicate metric + # family name with a different description + pattern = "test_dup_metric_family_diff_description second description" + self.assertNotIn(pattern, metrics) + + def test_dup_metric_family(self): + # Test that adding a duplicate metric family will reuse the original + # and not add another entry to registry + metric_family1 = pb_utils.MetricFamily( + name="test_dup_metric_family", + description="dup description", + kind=pb_utils.MetricFamily.COUNTER, + ) + metric_family2 = pb_utils.MetricFamily( + name="test_dup_metric_family", + description="dup description", + kind=pb_utils.MetricFamily.COUNTER, + ) + + metric_key = "custom_metric_key" + metric1 = metric_family1.Metric(labels={metric_key: "label1"}) + metric2 = metric_family2.Metric(labels={metric_key: "label2"}) + + self.assertEqual(metric1.value(), 0) + self.assertEqual(metric2.value(), 0) + + patterns = [ + "# HELP test_dup_metric_family dup description", + "# TYPE test_dup_metric_family counter", + 'test_dup_metric_family{custom_metric_key="label2"} 0', + 'test_dup_metric_family{custom_metric_key="label1"} 0', + ] + metrics = self._get_metrics() + for pattern in patterns: + self.assertIn(pattern, metrics) + + def test_dup_metric_labels(self): + # Test that adding a duplicate metric will refer to the same + # underlying metric, and all instances will be updated + labels = {"example1": "label1", "example2": "label2"} + self._dup_metric_helper(labels) + + def test_dup_metric_empty_labels(self): + # Test that adding a duplicate metric will refer to the same + # underlying metric, and all instances will be updated + self._dup_metric_helper() + + def test_metric_lifetime_error(self): + # Test the error handling when the corresponding 'MetricFamily' is + # deleted before the 'Metric' is deleted, and the 'Metric' is still + # being used for metric operations + kinds = [pb_utils.MetricFamily.COUNTER, 
pb_utils.MetricFamily.GAUGE] + metric_family_names = [ + "test_metric_lifetime_error_counter", + "test_metric_lifetime_error_gauge", + ] + for kind, name in zip(kinds, metric_family_names): + metric_family = pb_utils.MetricFamily( + name=name, description="test metric lifetime error", kind=kind + ) + labels = {"example1": "counter_label1", "example2": "counter_label2"} + metric = metric_family.Metric(labels=labels) + + # Intentionally delete the 'MetricFamily' before the 'Metric' being deleted + del metric_family + + error_msg = "Invalid metric operation as the corresponding 'MetricFamily' has been deleted." + + # Counter does not support set + if kind is not pb_utils.MetricFamily.COUNTER: + with self.assertRaises(pb_utils.TritonModelException) as ex: + metric.set(10) + self.assertIn(error_msg, str(ex.exception)) + + with self.assertRaises(pb_utils.TritonModelException) as ex: + metric.increment(10) + self.assertIn(error_msg, str(ex.exception)) + + with self.assertRaises(pb_utils.TritonModelException) as ex: + metric.value() + self.assertIn(error_msg, str(ex.exception)) + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. + test = unittest.main("model", exit=False) + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/python_models/delayed_model/config.pbtxt b/qa/python_models/delayed_model/config.pbtxt new file mode 100644 index 0000000000..493a3c8fdb --- /dev/null +++ b/qa/python_models/delayed_model/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
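The `delayed_model` defined next sleeps for five seconds at import time to exercise delayed startup, so a client may see the server up before this particular model is ready. A readiness-polling sketch, assuming `tritonclient` and a local server (assumptions, not part of this diff):

```python
import time

import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# Poll until the slow-loading model reports ready, or give up after ~30 s.
deadline = time.time() + 30
while not client.is_model_ready("delayed_model"):
    if time.time() > deadline:
        raise TimeoutError("delayed_model did not become ready in time")
    time.sleep(0.5)
print("delayed_model is ready")
```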
+ +name: "delayed_model" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/delayed_model/model.py b/qa/python_models/delayed_model/model.py new file mode 100644 index 0000000000..e7538148f1 --- /dev/null +++ b/qa/python_models/delayed_model/model.py @@ -0,0 +1,45 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + +# Sleep for 5 seconds to ensure that delayed startup works properly. +time.sleep(5) + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses + + def finalize(self): + pass diff --git a/qa/python_models/dlpack_add_sub/config.pbtxt b/qa/python_models/dlpack_add_sub/config.pbtxt new file mode 100644 index 0000000000..2a2a5f8694 --- /dev/null +++ b/qa/python_models/dlpack_add_sub/config.pbtxt @@ -0,0 +1,66 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_add_sub" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{kind : KIND_CPU}] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value:"no" + } +} diff --git a/qa/python_models/dlpack_add_sub/model.py b/qa/python_models/dlpack_add_sub/model.py new file mode 100644 index 0000000000..7f70e05d5c --- /dev/null +++ b/qa/python_models/dlpack_add_sub/model.py @@ -0,0 +1,130 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
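The `dlpack_add_sub` model that follows exchanges tensors with PyTorch through DLPack instead of copying through NumPy. The underlying round trip is zero-copy, which can be seen with PyTorch alone (no Triton involved); a small standalone sketch:

```python
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

# to_dlpack exports the tensor as a DLPack capsule; from_dlpack re-imports it
# without copying the underlying buffer.
src = torch.arange(16, dtype=torch.float32)
capsule = to_dlpack(src)
dst = from_dlpack(capsule)

dst += 1  # in-place update mutates the shared storage
assert torch.equal(src, dst)  # both names see the same memory
print(src[:4])
```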
+ +import json + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + self.numpy_to_pytorch_dtype = { + np.bool_: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + } + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + # If both of the tensors are in CPU, use NumPy. + if in_0.is_cpu() and in_1.is_cpu(): + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) + - in_1.as_numpy().astype(np.int32), + ) + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", out_0.astype(output0_dtype) + ) + out_tensor_1 = pb_utils.Tensor( + "OUTPUT1", out_1.astype(output1_dtype) + ) + else: + in_0_pytorch, in_1_pytorch = from_dlpack( + in_0.to_dlpack() + ), from_dlpack(in_1.to_dlpack()) + out_0, out_1 = ( + in_0_pytorch + in_1_pytorch, + in_0_pytorch - in_1_pytorch, + ) + + if self.output0_dtype == np.object_: + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", out_0.numpy().astype(output0_dtype) + ) + else: + out_0 = out_0.type(self.numpy_to_pytorch_dtype[output0_dtype]) + out_tensor_0 = pb_utils.Tensor.from_dlpack( + "OUTPUT0", to_dlpack(out_0) + ) + + if self.output1_dtype == np.object_: + out_tensor_1 = pb_utils.Tensor( + "OUTPUT1", out_1.numpy().astype(output1_dtype) + ) + else: + out_1 = out_1.type(self.numpy_to_pytorch_dtype[output1_dtype]) + out_tensor_1 = pb_utils.Tensor.from_dlpack( + "OUTPUT1", to_dlpack(out_1) + ) + + else: + in_0_pytorch, in_1_pytorch = ( + from_dlpack(in_0.to_dlpack()).cuda(), + from_dlpack(in_1.to_dlpack()).cuda(), + ) + out_0, out_1 = ( + in_0_pytorch + in_1_pytorch, + in_0_pytorch - in_1_pytorch, + ) + out_tensor_0 = pb_utils.Tensor.from_dlpack("OUTPUT0", to_dlpack(out_0)) + out_tensor_1 = pb_utils.Tensor.from_dlpack("OUTPUT1", to_dlpack(out_1)) + + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + + return responses diff --git a/qa/python_models/dlpack_empty_output/config.pbtxt b/qa/python_models/dlpack_empty_output/config.pbtxt new file mode 100644 index 0000000000..d026db1cd1 --- /dev/null +++ b/qa/python_models/dlpack_empty_output/config.pbtxt @@ -0,0 +1,43 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_empty_output" +max_batch_size: 8 + +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] diff --git a/qa/python_models/dlpack_empty_output/model.py b/qa/python_models/dlpack_empty_output/model.py new file mode 100644 index 0000000000..7784e28b4d --- /dev/null +++ b/qa/python_models/dlpack_empty_output/model.py @@ -0,0 +1,53 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
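The `dlpack_empty_output` model that follows returns a zero-element GPU tensor through DLPack, exercising the empty-shape edge case. The same round trip can be sanity-checked with plain PyTorch; this sketch assumes a CUDA-capable device is available (an assumption):

```python
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

# A zero-element tensor on the GPU still carries dtype/device/shape metadata.
empty_gpu = torch.ones((0,), dtype=torch.float32).to("cuda:0")

# DLPack export/import preserves the empty shape without copying anything.
roundtrip = from_dlpack(to_dlpack(empty_gpu))
assert roundtrip.shape == (0,)
assert roundtrip.is_cuda
print(roundtrip.shape, roundtrip.device)
```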
+ +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack + + +class TritonPythonModel: + def initialize(self, args): + pass + + def execute(self, requests): + responses = [] + + for _ in requests: + SHAPE = (0,) + + pytorch_tensor = torch.ones(SHAPE, dtype=torch.float32) + + device = torch.device("cuda:0") + pytorch_tensor = pytorch_tensor.to(device) + + dlpack_tensor = to_dlpack(pytorch_tensor) + pb_tensor = pb_utils.Tensor.from_dlpack("OUTPUT", dlpack_tensor) + + inference_response = pb_utils.InferenceResponse(output_tensors=[pb_tensor]) + responses.append(inference_response) + + return responses diff --git a/qa/python_models/dlpack_identity/config.pbtxt b/qa/python_models/dlpack_identity/config.pbtxt new file mode 100644 index 0000000000..292d0df85b --- /dev/null +++ b/qa/python_models/dlpack_identity/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_identity" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value:"no" + } +} diff --git a/qa/python_models/dlpack_identity/model.py b/qa/python_models/dlpack_identity/model.py new file mode 100644 index 0000000000..1bd0748df9 --- /dev/null +++ b/qa/python_models/dlpack_identity/model.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """Identity model in Python backend that works with GPU and CPU + tensors.""" + + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor.from_dlpack( + "OUTPUT0", input_tensor.to_dlpack() + ) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/dlpack_io_identity/config.pbtxt b/qa/python_models/dlpack_io_identity/config.pbtxt new file mode 100644 index 0000000000..c79595f64a --- /dev/null +++ b/qa/python_models/dlpack_io_identity/config.pbtxt @@ -0,0 +1,67 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
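The `dlpack_identity` model above simply re-wraps the incoming DLPack capsule as its output, so a client should receive its input back unchanged. A quick end-to-end check, assuming `tritonclient` and a server on localhost:8000 (assumptions, not part of this diff):

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# Leading dimension is the batch dimension (max_batch_size is 64 for this model).
data = np.random.rand(1, 16).astype(np.float32)
inp = httpclient.InferInput("INPUT0", list(data.shape), "FP32")
inp.set_data_from_numpy(data)

result = client.infer("dlpack_identity", inputs=[inp])
np.testing.assert_array_equal(result.as_numpy("OUTPUT0"), data)
print("identity round trip OK")
```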
+ +name: "dlpack_io_identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +input [ + { + name: "GPU_OUTPUT" + data_type: TYPE_BOOL + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "NEXT_GPU_OUTPUT" + data_type: TYPE_BOOL + dims: [ -1 ] + } +] + +instance_group [{kind : KIND_CPU}] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value:"no" + } +} diff --git a/qa/python_models/dlpack_io_identity/model.py b/qa/python_models/dlpack_io_identity/model.py new file mode 100644 index 0000000000..225d026992 --- /dev/null +++ b/qa/python_models/dlpack_io_identity/model.py @@ -0,0 +1,108 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +class TritonPythonModel: + """ + This Python identity model passes the DLPack tensors as is. "OUTPUT_IS_GPU" + input controls whether the model should put the output in GPU or in CPU. 
+ """ + + def initialize(self, args): + self._model_name = args["model_name"] + + def execute(self, requests): + responses = [] + for request in requests: + input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + gpu_output = pb_utils.get_input_tensor_by_name( + request, "GPU_OUTPUT" + ).as_numpy() + + if input0.is_cpu(): + if not gpu_output[0]: + output0 = pb_utils.Tensor.from_dlpack("OUTPUT0", input0.to_dlpack()) + else: + outptu0_pytorch = from_dlpack(input0.to_dlpack()).cuda() + output0 = pb_utils.Tensor.from_dlpack( + "OUTPUT0", to_dlpack(outptu0_pytorch) + ) + else: + if gpu_output[0]: + output0 = pb_utils.Tensor.from_dlpack("OUTPUT0", input0.to_dlpack()) + else: + outptu0_pytorch = from_dlpack(input0.to_dlpack()).cpu() + output0 = pb_utils.Tensor.from_dlpack( + "OUTPUT0", to_dlpack(outptu0_pytorch) + ) + + next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:]) + + # Do not perform BLS inference if it is the first + # model in the pipeline. + if self._model_name != "dlpack_io_identity_1": + infer_request = pb_utils.InferenceRequest( + model_name="dlpack_io_identity_1", + inputs=[ + input0, + pb_utils.get_input_tensor_by_name(request, "GPU_OUTPUT"), + ], + requested_output_names=["OUTPUT0"], + ) + infer_response = infer_request.exec() + + if infer_response.has_error(): + raise pb_utils.TritonModelException( + infer_response.error().message() + ) + + bls_output0 = pb_utils.get_output_tensor_by_name( + infer_response, "OUTPUT0" + ) + if not output0.is_cpu(): + bls_output0 = ( + from_dlpack(bls_output0.to_dlpack()).detach().cpu().numpy() + ) + else: + bls_output0 = bls_output0.as_numpy() + + if not input0.is_cpu(): + input0 = from_dlpack(input0.to_dlpack()).detach().cpu().numpy() + else: + input0 = input0.as_numpy() + + if not np.allclose(bls_output0, input0): + raise pb_utils.TritonModelException( + "BLS input and output tensors are not equal" + ) + + responses.append(pb_utils.InferenceResponse([output0, next_gpu_output])) + + return responses diff --git a/qa/python_models/dlpack_io_identity_decoupled/config.pbtxt b/qa/python_models/dlpack_io_identity_decoupled/config.pbtxt new file mode 100644 index 0000000000..6bf39810a5 --- /dev/null +++ b/qa/python_models/dlpack_io_identity_decoupled/config.pbtxt @@ -0,0 +1,71 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_io_identity_decoupled" +backend: "python" +max_batch_size: 0 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +input [ + { + name: "GPU_OUTPUT" + data_type: TYPE_BOOL + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "NEXT_GPU_OUTPUT" + data_type: TYPE_BOOL + dims: [ -1 ] + } +] + +instance_group [{kind : KIND_CPU}] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value:"no" + } +} diff --git a/qa/python_models/dlpack_io_identity_decoupled/model.py b/qa/python_models/dlpack_io_identity_decoupled/model.py new file mode 100644 index 0000000000..5f4e597df8 --- /dev/null +++ b/qa/python_models/dlpack_io_identity_decoupled/model.py @@ -0,0 +1,113 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import threading +import time + +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +class TritonPythonModel: + """ + This Python identity model passes the DLPack tensors as is. "OUTPUT_IS_GPU" + input controls whether the model should put the output in GPU or in CPU. + """ + + def initialize(self, args): + self._model_name = args["model_name"] + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + + def response_thread(self, response_sender, input0, gpu_output): + # Sleep 5 seconds to make sure the main thread has exited. 
+ time.sleep(5) + + if input0.is_cpu(): + if not gpu_output[0]: + output0 = pb_utils.Tensor.from_dlpack("OUTPUT0", input0.to_dlpack()) + else: + outptu0_pytorch = from_dlpack(input0.to_dlpack()).cuda() + output0 = pb_utils.Tensor.from_dlpack( + "OUTPUT0", to_dlpack(outptu0_pytorch) + ) + else: + if gpu_output[0]: + output0 = pb_utils.Tensor.from_dlpack("OUTPUT0", input0.to_dlpack()) + else: + output0_pytorch = from_dlpack(input0.to_dlpack()).cpu() + output0 = pb_utils.Tensor.from_dlpack( + "OUTPUT0", to_dlpack(output0_pytorch) + ) + + next_gpu_output = pb_utils.Tensor("NEXT_GPU_OUTPUT", gpu_output[1:]) + infer_response = pb_utils.InferenceResponse([output0, next_gpu_output]) + + # Number of times to repeat the response + response_repeat = 2 + for _ in range(response_repeat): + response_sender.send(infer_response) + + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + with self.inflight_thread_count_lck: + self.inflight_thread_count -= 1 + + def execute(self, requests): + for request in requests: + input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + gpu_output = pb_utils.get_input_tensor_by_name( + request, "GPU_OUTPUT" + ).as_numpy() + + thread = threading.Thread( + target=self.response_thread, + args=(request.get_response_sender(), input0, gpu_output), + ) + + thread.daemon = True + + with self.inflight_thread_count_lck: + self.inflight_thread_count += 1 + + thread.start() + + def finalize(self): + inflight_threads = True + cycles = 0 + logging_time_sec = 5 + sleep_time_sec = 0.1 + cycle_to_log = logging_time_sec / sleep_time_sec + while inflight_threads: + with self.inflight_thread_count_lck: + inflight_threads = self.inflight_thread_count != 0 + if cycles % cycle_to_log == 0: + print( + f"Waiting for {self.inflight_thread_count} response threads to complete..." + ) + if inflight_threads: + time.sleep(sleep_time_sec) + cycles += 1 diff --git a/qa/python_models/dlpack_square/config.pbtxt b/qa/python_models/dlpack_square/config.pbtxt new file mode 100644 index 0000000000..15cf6b7fd2 --- /dev/null +++ b/qa/python_models/dlpack_square/config.pbtxt @@ -0,0 +1,48 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_square" +backend: "python" +max_batch_size: 0 +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +instance_group [{ kind: KIND_CPU }] + diff --git a/qa/python_models/dlpack_square/model.py b/qa/python_models/dlpack_square/model.py new file mode 100644 index 0000000000..b31531461e --- /dev/null +++ b/qa/python_models/dlpack_square/model.py @@ -0,0 +1,139 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import threading + +import numpy as np +import torch + +# triton_python_backend_utils is available in every Triton Python model. You +# need to use this module to create inference requests and responses. It also +# contains some utility functions for extracting information from model_config +# and converting Triton input/output types to numpy types. 
+import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + +# Maps NumPy dtypes to the matching PyTorch dtypes for DLPack round trips. +numpy_to_pytorch_dtype = { + np.bool_: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, +} + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output_config = pb_utils.get_output_config_by_name(model_config, "OUT") + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + self.inflight_thread_count = 0 + self.inflight_thread_count_lck = threading.Lock() + + def execute(self, requests): + for request in requests: + self.process_request(request) + + return None + + def process_request(self, request): + # Start a separate thread to send the responses for the request. + # Sending the responses back is delegated to this thread. + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN"), + self.output_dtype, + ), + ) + + thread.daemon = True + + with self.inflight_thread_count_lck: + self.inflight_thread_count += 1 + + thread.start() + + def response_thread(self, response_sender, in_input, output_dtype): + # The response_sender is used to send response(s) associated with the + # corresponding request. + + # For a scalar input with value N, send N responses, each echoing the + # input tensor. + for idx in range(in_input.as_numpy()[0]): + if in_input.is_cpu(): + if ( + in_input.as_numpy().dtype.type is np.bytes_ + or in_input.as_numpy().dtype == np.object_ + ): + out_0 = in_input.as_numpy().astype(np.int32) + out_tensor = pb_utils.Tensor("OUT", out_0.astype(output_dtype)) + else: + in_0_pytorch = from_dlpack(in_input.to_dlpack()) + out_0 = in_0_pytorch + if output_dtype == np.object_: + out_tensor = pb_utils.Tensor( + "OUT", out_0.numpy().astype(output_dtype) + ) + else: + out_0 = out_0.type(numpy_to_pytorch_dtype[output_dtype]) + out_tensor = pb_utils.Tensor.from_dlpack( + "OUT", to_dlpack(out_0) + ) + else: + in_0_pytorch = from_dlpack(in_input.to_dlpack()).cuda() + out_0 = in_0_pytorch + out_tensor = pb_utils.Tensor.from_dlpack("OUT", to_dlpack(out_0)) + + response = pb_utils.InferenceResponse(output_tensors=[out_tensor]) + response_sender.send(response) + + # We must close the response sender to indicate to Triton that we are + # done sending responses for the corresponding request. We can't use the + # response sender after closing it. The response sender is closed by + # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL. + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + with self.inflight_thread_count_lck: + self.inflight_thread_count -= 1 diff --git a/qa/python_models/dlpack_sub_add/config.pbtxt b/qa/python_models/dlpack_sub_add/config.pbtxt new file mode 100644 index 0000000000..c9614476c1 --- /dev/null +++ b/qa/python_models/dlpack_sub_add/config.pbtxt @@ -0,0 +1,66 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_sub_add" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{kind : KIND_CPU}] + +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value:"no" + } +} diff --git a/qa/python_models/dlpack_sub_add/model.py b/qa/python_models/dlpack_sub_add/model.py new file mode 100644 index 0000000000..16caafcea2 --- /dev/null +++ b/qa/python_models/dlpack_sub_add/model.py @@ -0,0 +1,130 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + self.numpy_to_pytorch_dtype = { + np.bool_: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + } + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + # If both of the tensors are in CPU, use NumPy. + if in_0.is_cpu() and in_1.is_cpu(): + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + - in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) + + in_1.as_numpy().astype(np.int32), + ) + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", out_0.astype(output0_dtype) + ) + out_tensor_1 = pb_utils.Tensor( + "OUTPUT1", out_1.astype(output1_dtype) + ) + else: + in_0_pytorch, in_1_pytorch = from_dlpack( + in_0.to_dlpack() + ), from_dlpack(in_1.to_dlpack()) + out_0, out_1 = ( + in_0_pytorch - in_1_pytorch, + in_0_pytorch + in_1_pytorch, + ) + + if self.output0_dtype == np.object_: + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", out_0.numpy().astype(output0_dtype) + ) + else: + out_0 = out_0.type(self.numpy_to_pytorch_dtype[output0_dtype]) + out_tensor_0 = pb_utils.Tensor.from_dlpack( + "OUTPUT0", to_dlpack(out_0) + ) + + if self.output1_dtype == np.object_: + out_tensor_1 = pb_utils.Tensor( + "OUTPUT1", out_1.numpy().astype(output1_dtype) + ) + else: + out_1 = out_1.type(self.numpy_to_pytorch_dtype[output1_dtype]) + out_tensor_1 = pb_utils.Tensor.from_dlpack( + "OUTPUT1", to_dlpack(out_1) + ) + + else: + in_0_pytorch, in_1_pytorch = ( + from_dlpack(in_0.to_dlpack()).cuda(), + from_dlpack(in_1.to_dlpack()).cuda(), + ) + out_0, out_1 = ( + in_0_pytorch - in_1_pytorch, + in_0_pytorch + in_1_pytorch, + ) + out_tensor_0 = pb_utils.Tensor.from_dlpack("OUTPUT0", to_dlpack(out_0)) + out_tensor_1 = pb_utils.Tensor.from_dlpack("OUTPUT1", to_dlpack(out_1)) + + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + + return responses diff --git a/qa/python_models/dlpack_test/config.pbtxt b/qa/python_models/dlpack_test/config.pbtxt new file mode 100644 
index 0000000000..930c71ff74 --- /dev/null +++ b/qa/python_models/dlpack_test/config.pbtxt @@ -0,0 +1,44 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "dlpack_test" +backend: "python" +max_batch_size: 0 + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/dlpack_test/model.py b/qa/python_models/dlpack_test/model.py new file mode 100644 index 0000000000..64bc7d6692 --- /dev/null +++ b/qa/python_models/dlpack_test/model.py @@ -0,0 +1,343 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import unittest + +import cupy as cp +import numpy as np +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import from_dlpack, to_dlpack + + +class PBTensorTest(unittest.TestCase): + def test_pytorch_dlpack(self): + # Test different dtypes + pytorch_dtypes = [ + torch.float16, + torch.float32, + torch.float64, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.uint8, + ] + + for pytorch_dtype in pytorch_dtypes: + pytorch_tensor = torch.ones([100], dtype=pytorch_dtype) + dlpack_tensor = to_dlpack(pytorch_tensor) + pb_tensor = pb_utils.Tensor.from_dlpack("test_tensor", dlpack_tensor) + self.assertTrue( + np.array_equal(pb_tensor.as_numpy(), pytorch_tensor.numpy()) + ) + + # Convert the tensor back to DLPack and ensure that both tensors are + # the same + pytorch_tensor_dlpack = from_dlpack(pb_tensor.to_dlpack()) + self.assertTrue(torch.equal(pytorch_tensor_dlpack, pytorch_tensor)) + + self.assertEqual(pytorch_tensor.type(), pytorch_tensor_dlpack.type()) + + # Now let's check that upgraded DLPack implementation also + # works as expected, i.e. from_dlpack should work with + # external pytorch tensor directly + + pb_tensor_upgraded = pb_utils.Tensor.from_dlpack( + "test_tensor", pytorch_tensor + ) + self.assertTrue( + np.array_equal(pb_tensor_upgraded.as_numpy(), pytorch_tensor.numpy()) + ) + + # Here we check that `pb_tensor` as a producer, properly + # invokes `__dlpack__` and `__dlpack_device__` + pytorch_tensor_dlpack = from_dlpack(pb_tensor_upgraded) + self.assertTrue(torch.equal(pytorch_tensor_dlpack, pytorch_tensor)) + + self.assertEqual(pytorch_tensor.type(), pytorch_tensor_dlpack.type()) + + def test_non_contiguous_error(self): + pytorch_tensor = torch.rand([20, 30], dtype=torch.float16) + + # Transposing a PyTorch tensor leads to a non contiguous tensor. + pytorch_tensor = torch.transpose(pytorch_tensor, 0, 1) + + with self.assertRaises(Exception) as e: + pb_utils.Tensor.from_dlpack("test_tensor", to_dlpack(pytorch_tensor)) + self.assertTrue( + str(e.exception) + == "DLPack tensor is not contiguous. Only contiguous DLPack tensors that are stored in C-Order are supported." + ) + + def test_dlpack_string_tensor(self): + np_object = np.array(["An Example String"], dtype=np.object_) + pb_tensor = pb_utils.Tensor("test_tensor", np_object) + + with self.assertRaises(Exception) as e: + pb_tensor.to_dlpack() + + self.assertTrue( + str(e.exception) == "DLPack does not have support for string tensors." 
+ ) + + def test_dlpack_gpu_tensors(self): + # Test different dtypes + # PyTorch does not support DLPack bool type yet: + # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/DLConvertor.cpp + pytorch_dtypes = [ + torch.float16, + torch.float32, + torch.float64, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.uint8, + ] + + for pytorch_dtype in pytorch_dtypes: + pytorch_tensor = torch.ones([100], dtype=pytorch_dtype, device="cuda") + dlpack_tensor = to_dlpack(pytorch_tensor) + pb_tensor = pb_utils.Tensor.from_dlpack("test_tensor", dlpack_tensor) + + # Convert the tensor back to DLPack and ensure that both tensors are + # the same + pytorch_tensor_dlpack = from_dlpack(pb_tensor.to_dlpack()) + self.assertTrue(torch.equal(pytorch_tensor_dlpack, pytorch_tensor)) + self.assertEqual(pytorch_tensor.type(), pytorch_tensor_dlpack.type()) + + # Now we make sure that updated DLPack implementation works + # with GPU as well + pb_tensor = pb_utils.Tensor.from_dlpack("test_tensor", pytorch_tensor) + pytorch_tensor_dlpack = from_dlpack(pb_tensor) + self.assertTrue(torch.equal(pytorch_tensor_dlpack, pytorch_tensor)) + self.assertEqual(pytorch_tensor.type(), pytorch_tensor_dlpack.type()) + + def test_dlpack_gpu_numpy(self): + # DLPack tesnors that are in GPU cannot be converted to NumPy + pytorch_tensor = torch.rand([100], dtype=torch.float16, device="cuda") * 100 + pb_tensor = pb_utils.Tensor.from_dlpack("tensor", to_dlpack(pytorch_tensor)) + # Make sure that `__dlpack_device__` works as expected + self.assertFalse(pb_tensor.is_cpu()) + self.assertTrue(pytorch_tensor.is_cuda) + self.assertEqual( + pb_tensor.__dlpack_device__(), pytorch_tensor.__dlpack_device__() + ) + + with self.assertRaises(Exception) as e: + pb_tensor.as_numpy() + self.assertTrue( + str(e.exception) + == "Tensor is stored in GPU and cannot be converted to NumPy." 
+ ) + + def test_dlpack_cpu_numpy(self): + # Check compatibiity of PbTensor DLPack implementation + # with numpy + pytorch_tensor = torch.rand([100], dtype=torch.float16, device="cpu") * 100 + pb_tensor = pb_utils.Tensor.from_dlpack("tensor", pytorch_tensor) + numpy_tensor_dlpack = np.from_dlpack(pb_tensor) + self.assertTrue(np.array_equal(numpy_tensor_dlpack, pytorch_tensor.numpy())) + # Make sure that `__dlpack_device__` works as expected + self.assertTrue(pb_tensor.is_cpu()) + self.assertFalse(pytorch_tensor.is_cuda) + self.assertEqual( + pb_tensor.__dlpack_device__(), pytorch_tensor.__dlpack_device__() + ) + + def test_bool_datatype(self): + # [FIXME] pass bool_array directly to `pb_utils.Tensor.from_dlpack`, + # when numpy release supports DLPack bool type + bool_array = np.asarray([False, True]) + bool_tensor = pb_utils.Tensor("tensor", bool_array) + bool_tensor_dlpack = pb_utils.Tensor.from_dlpack("tensor", bool_tensor) + self.assertTrue(np.array_equal(bool_array, bool_tensor_dlpack.as_numpy())) + + def test_cuda_multi_stream(self): + # Test that external stream syncs with the default + # and pb_tensor has proper data + size = 5000 + pytorch_tensor_1 = torch.tensor([0, 0, 0, 0], device="cuda") + pytorch_tensor_2 = torch.tensor([0, 0, 0, 0], device="cuda") + expected_output = torch.tensor([2, 2, 2, 2], device="cuda") + s1 = torch.cuda.Stream() + with torch.cuda.stream(s1): + matrix_a = torch.randn(size, size, device="cuda") + res = torch.matmul(matrix_a, matrix_a) + for _ in range(1000): + res = torch.matmul(res, matrix_a) + pytorch_tensor_1 += torch.tensor([2, 2, 2, 2], device="cuda") + pytorch_tensor_2 += torch.tensor([2, 2, 2, 2], device="cuda") + + pb_tensor_1 = pb_utils.Tensor.from_dlpack("tensor", pytorch_tensor_1) + pb_tensor_2 = pb_utils.Tensor.from_dlpack("tensor", to_dlpack(pytorch_tensor_2)) + pytorch_tensor_dlpack = from_dlpack(pb_tensor_1) + self.assertTrue(torch.equal(pytorch_tensor_dlpack, expected_output)) + pytorch_tensor_dlpack = from_dlpack(pb_tensor_2) + self.assertTrue(torch.equal(pytorch_tensor_dlpack, expected_output)) + + def test_cuda_non_blocking_multi_stream(self): + # Test that external non-blocking stream syncs with the default stream + # and pb_tensor has proper data + size = 5000 + cupy_tensor = cp.array([0, 0, 0, 0]) + expected_output = cp.array([2, 2, 2, 2]) + non_blocking_stream = cp.cuda.Stream(non_blocking=True) + with non_blocking_stream: + matrix_a = cp.random.rand(size, size) + res = cp.matmul(matrix_a, matrix_a) + for _ in range(1000): + res = cp.matmul(res, matrix_a) + cupy_tensor += cp.array([2, 2, 2, 2]) + + pb_tensor = pb_utils.Tensor.from_dlpack("tensor", cupy_tensor) + # Verify that non-blocking stream has no pending jobs left + self.assertTrue(non_blocking_stream.done) + cupy_tensor_dlpack = cp.from_dlpack(pb_tensor) + self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output)) + self.assertFalse(pb_tensor.is_cpu()) + self.assertEqual(pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__()) + + def test_cuda_multi_gpu(self): + # Test that when `pb_utils.Tensor.from_dlpack` is called on different + # GPU from where external tensor is stored, we receive a pointer + # and all pending work on different GPU's default stream + # on external tensor is done + size = 5000 + # DLDeviceType::kDLCUDA, device_id 1 + expected_dlpack_device = (2, 1) + with cp.cuda.Device(1): + expected_output = cp.array([2, 2, 2, 2]) + cupy_tensor = cp.array([0, 0, 0, 0]) + matrix_a = cp.random.rand(size, size) + res = cp.matmul(matrix_a, matrix_a) + 
for _ in range(1000): + res = cp.matmul(res, matrix_a) + cupy_tensor += cp.array([2, 2, 2, 2]) + with cp.cuda.Device(0): + pb_tensor = pb_utils.Tensor.from_dlpack("tensor", cupy_tensor) + with cp.cuda.Device(1): + # To make sure that the default stream is done with + # all compute work + self.assertTrue(cp.cuda.Stream(null=True).done) + cupy_tensor_dlpack = cp.from_dlpack(pb_tensor) + + with cp.cuda.Device(1): + self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output)) + + self.assertFalse(pb_tensor.is_cpu()) + self.assertEqual(pb_tensor.__dlpack_device__(), expected_dlpack_device) + self.assertEqual(pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__()) + + def test_cuda_blocking_stream_multi_gpu(self): + # Test that when `pb_utils.Tensor.from_dlpack` is called on different + # GPU from where external tensor is stored, we receive a pointer + # and all pending work on different GPU's a blocking stream + # on external tensor is done + size = 5000 + # DLDeviceType::kDLCUDA, device_id 1 + expected_dlpack_device = (2, 1) + with cp.cuda.Device(1): + expected_output = cp.array([2, 2, 2, 2]) + blocking_stream = cp.cuda.Stream(non_blocking=False) + with blocking_stream: + cupy_tensor = cp.array([0, 0, 0, 0]) + matrix_a = cp.random.rand(size, size) + res = cp.matmul(matrix_a, matrix_a) + for _ in range(1000): + res = cp.matmul(res, matrix_a) + cupy_tensor += cp.array([2, 2, 2, 2]) + with cp.cuda.Device(0): + pb_tensor = pb_utils.Tensor.from_dlpack("tensor", cupy_tensor) + with cp.cuda.Device(1): + # To make sure that blocking stream is done with + # all compute work + self.assertTrue(blocking_stream.done) + cupy_tensor_dlpack = cp.from_dlpack(pb_tensor) + + with cp.cuda.Device(1): + self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output)) + + self.assertFalse(pb_tensor.is_cpu()) + self.assertEqual(pb_tensor.__dlpack_device__(), expected_dlpack_device) + self.assertEqual(pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__()) + + def test_cuda_non_blocking_stream_multi_gpu(self): + # Test that when `pb_utils.Tensor.from_dlpack` is called on different + # GPU from where external tensor is stored, we receive a pointer + # and all pending work on different GPU's non-blocking stream + # on external tensor is done. + # This test seems to be affected by `test_cuda_multi_gpu` + # and `test_cuda_blocking_stream_multi_gpu` if GPUs 0 and 1 are used. 
+ # Thus for this test, we use GPUs 0 and 2 + # JIRA: DLIS-4887 + size = 5000 + # DLDeviceType::kDLCUDA, device_id 1 + expected_dlpack_device = (2, 2) + with cp.cuda.Device(2): + expected_output = cp.array([2, 2, 2, 2]) + non_blocking_stream = cp.cuda.Stream(non_blocking=True) + with non_blocking_stream: + cupy_tensor = cp.array([0, 0, 0, 0]) + matrix_a = cp.random.rand(size, size) + res = cp.matmul(matrix_a, matrix_a) + for _ in range(1000): + res = cp.matmul(res, matrix_a) + cupy_tensor += cp.array([2, 2, 2, 2]) + with cp.cuda.Device(0): + pb_tensor = pb_utils.Tensor.from_dlpack("tensor", cupy_tensor) + with cp.cuda.Device(2): + # To make sure that non_blocking stream is done with + # all compute work + self.assertTrue(non_blocking_stream.done) + cupy_tensor_dlpack = cp.from_dlpack(pb_tensor) + + with cp.cuda.Device(2): + self.assertTrue(cp.array_equal(cupy_tensor_dlpack, expected_output)) + + self.assertFalse(pb_tensor.is_cpu()) + self.assertEqual(pb_tensor.__dlpack_device__(), expected_dlpack_device) + self.assertEqual(pb_tensor.__dlpack_device__(), cupy_tensor.__dlpack_device__()) + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + for _ in requests: + # Run the unittest and store the results in InferenceResponse. + test = unittest.main("model", exit=False) + responses.append( + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) + return responses diff --git a/qa/python_models/ensemble/config.pbtxt b/qa/python_models/ensemble/config.pbtxt new file mode 100644 index 0000000000..680364fac0 --- /dev/null +++ b/qa/python_models/ensemble/config.pbtxt @@ -0,0 +1,98 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "ensemble" +platform: "ensemble" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "add_sub_1" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "output_0" + } + output_map { + key: "OUTPUT1" + value: "output_1" + } + }, + { + model_name: "add_sub_2" + model_version: -1 + input_map { + key: "INPUT0" + value: "output_0" + } + input_map { + key: "INPUT1" + value: "output_1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/python_models/ensemble_gpu/config.pbtxt b/qa/python_models/ensemble_gpu/config.pbtxt new file mode 100644 index 0000000000..e6c7ebf298 --- /dev/null +++ b/qa/python_models/ensemble_gpu/config.pbtxt @@ -0,0 +1,98 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "ensemble_gpu" +platform: "ensemble" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "add_sub_1" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "output_0" + } + output_map { + key: "OUTPUT1" + value: "output_1" + } + }, + { + model_name: "libtorch_float32_float32_float32" + model_version: -1 + input_map { + key: "INPUT0" + value: "output_0" + } + input_map { + key: "INPUT1" + value: "output_1" + } + output_map { + key: "OUTPUT__0" + value: "OUTPUT0" + } + output_map { + key: "OUTPUT__1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/python_models/ensemble_io/config.pbtxt b/qa/python_models/ensemble_io/config.pbtxt new file mode 100644 index 0000000000..8819601c53 --- /dev/null +++ b/qa/python_models/ensemble_io/config.pbtxt @@ -0,0 +1,124 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "ensemble_io" +platform: "ensemble" + +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +input [ + { + name: "GPU_OUTPUT" + data_type: TYPE_BOOL + dims: [ -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] +output [ + { + name: "NEXT_GPU_OUTPUT" + data_type: TYPE_BOOL + dims: [ -1 ] + } +] + +ensemble_scheduling { + step [ + { + model_name: "dlpack_io_identity_1" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "GPU_OUTPUT" + value: "GPU_OUTPUT" + } + output_map { + key: "OUTPUT0" + value: "output_0" + } + output_map { + key: "NEXT_GPU_OUTPUT" + value: "next_gpu_output" + } + }, + { + model_name: "dlpack_io_identity_2" + model_version: -1 + input_map { + key: "INPUT0" + value: "output_0" + } + input_map { + key: "GPU_OUTPUT" + value: "next_gpu_output" + } + output_map { + key: "OUTPUT0" + value: "output_1" + } + output_map { + key: "NEXT_GPU_OUTPUT" + value: "next_gpu_output_1" + } + }, + { + model_name: "dlpack_io_identity_3" + model_version: -1 + input_map { + key: "INPUT0" + value: "output_1" + } + input_map { + key: "GPU_OUTPUT" + value: "next_gpu_output_1" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + output_map { + key: "NEXT_GPU_OUTPUT" + value: "NEXT_GPU_OUTPUT" + } + } + ] +} diff --git a/qa/python_models/error_code/config.pbtxt b/qa/python_models/error_code/config.pbtxt new file mode 100644 index 0000000000..90fd5eb1e3 --- /dev/null +++ b/qa/python_models/error_code/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "error_code" +backend: "python" +max_batch_size: 4 + +input [ + { + name: "ERROR_CODE" + data_type: TYPE_STRING + dims: [ 1 ] + } +] + +output [ + { + name: "DUMMY_OUT" + data_type: TYPE_STRING + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/error_code/model.py b/qa/python_models/error_code/model.py new file mode 100644 index 0000000000..078a4afb73 --- /dev/null +++ b/qa/python_models/error_code/model.py @@ -0,0 +1,59 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + error_code_map = { + "UNKNOWN": pb_utils.TritonError.UNKNOWN, + "INTERNAL": pb_utils.TritonError.INTERNAL, + "NOT_FOUND": pb_utils.TritonError.NOT_FOUND, + "INVALID_ARG": pb_utils.TritonError.INVALID_ARG, + "UNAVAILABLE": pb_utils.TritonError.UNAVAILABLE, + "UNSUPPORTED": pb_utils.TritonError.UNSUPPORTED, + "ALREADY_EXISTS": pb_utils.TritonError.ALREADY_EXISTS, + "CANCELLED": pb_utils.TritonError.CANCELLED, + } + + responses = [] + + for request in requests: + err_code_tensor = pb_utils.get_input_tensor_by_name( + request, "ERROR_CODE" + ).as_numpy() + err_code_str = str(err_code_tensor[0][0], encoding="utf-8") + if err_code_str in error_code_map: + error = pb_utils.TritonError( + message=("error code: " + err_code_str), + code=error_code_map[err_code_str], + ) + else: + error = pb_utils.TritonError("unrecognized error code: " + err_code_str) + responses.append(pb_utils.InferenceResponse(error=error)) + + return responses diff --git a/qa/python_models/execute_cancel/config.pbtxt b/qa/python_models/execute_cancel/config.pbtxt new file mode 100644 index 0000000000..df509863ad --- /dev/null +++ b/qa/python_models/execute_cancel/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "execute_cancel" +backend: "python" +max_batch_size: 1 + +input [ + { + name: "EXECUTE_DELAY" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "DUMMY_OUT" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/execute_cancel/model.py b/qa/python_models/execute_cancel/model.py new file mode 100644 index 0000000000..ec7b96ec1a --- /dev/null +++ b/qa/python_models/execute_cancel/model.py @@ -0,0 +1,108 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import threading +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self._logger = pb_utils.Logger + self._model_config = json.loads(args["model_config"]) + self._using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + self._model_config + ) + + def execute(self, requests): + processed_requests = [] + for request in requests: + delay_tensor = pb_utils.get_input_tensor_by_name( + request, "EXECUTE_DELAY" + ).as_numpy() + delay = delay_tensor[0][0] # seconds + if self._using_decoupled: + processed_requests.append( + {"response_sender": request.get_response_sender(), "delay": delay} + ) + else: + processed_requests.append({"request": request, "delay": delay}) + if self._using_decoupled: + return self._execute_decoupled(processed_requests) + return self._execute_processed_requests(processed_requests) + + def _execute_processed_requests(self, processed_requests): + responses = [] + for processed_request in processed_requests: + error = pb_utils.TritonError(message="not cancelled") + object_to_check_cancelled = None + if "response_sender" in processed_request: + object_to_check_cancelled = processed_request["response_sender"] + elif "request" in processed_request: + object_to_check_cancelled = processed_request["request"] + delay = processed_request["delay"] # seconds + time_elapsed = 0.0 # seconds + while time_elapsed < delay: + time.sleep(1) + time_elapsed += 1.0 + if object_to_check_cancelled.is_cancelled(): + self._logger.log_info( + "[execute_cancel] Request cancelled at " + + str(time_elapsed) + + " s" + ) + error = pb_utils.TritonError( + message="cancelled", code=pb_utils.TritonError.CANCELLED + ) + break + self._logger.log_info( + "[execute_cancel] Request not cancelled at " + + str(time_elapsed) + + " s" + ) + responses.append(pb_utils.InferenceResponse(error=error)) + return responses + + def _execute_decoupled(self, processed_requests): + def response_thread(execute_processed_requests, processed_requests): + time.sleep(2) # execute after requests are released + responses = execute_processed_requests(processed_requests) + for i in range(len(responses)): # len(responses) == len(processed_requests) + response_sender = processed_requests[i]["response_sender"] + response_sender.send(responses[i]) + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + thread = threading.Thread( + target=response_thread, + args=(self._execute_processed_requests, processed_requests), + ) + thread.daemon = True + thread.start() + return None diff --git a/qa/python_models/execute_delayed_model/config.pbtxt b/qa/python_models/execute_delayed_model/config.pbtxt new file mode 100644 index 0000000000..0a4ee59d3e --- /dev/null +++ b/qa/python_models/execute_delayed_model/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "simple" +backend: "python" +max_batch_size: 8 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 16 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 16 ] + } +] + +instance_group [ { kind: KIND_CPU }] diff --git a/qa/python_models/execute_delayed_model/model.py b/qa/python_models/execute_delayed_model/model.py new file mode 100644 index 0000000000..055b321a93 --- /dev/null +++ b/qa/python_models/execute_delayed_model/model.py @@ -0,0 +1,72 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + responses = [] + + time.sleep(15) + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") diff --git a/qa/python_models/execute_error/config.pbtxt b/qa/python_models/execute_error/config.pbtxt new file mode 100644 index 0000000000..66a8b0a797 --- /dev/null +++ b/qa/python_models/execute_error/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "execute_error" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_error/model.py b/qa/python_models/execute_error/model.py new file mode 100644 index 0000000000..9ecdbff816 --- /dev/null +++ b/qa/python_models/execute_error/model.py @@ -0,0 +1,50 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the first and third request + i = 0 + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if i == 0: + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + elif i == 1: + responses.append(pb_utils.InferenceResponse([out_tensor])) + elif i == 2: + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse(error=error)) + i += 1 + + return responses diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..70e247148a --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..d5087a49ec --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
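+
+# Test model that alternates between a successful response and an error
+# response on consecutive requests, exercising error propagation over gRPC.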
+
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def __init__(self):
+        # Track the total request count so every second request returns an error, simulating an intermittent model failure
+        self.inf_count = 1
+
+    def execute(self, requests):
+        """This function is called on inference request."""
+        responses = []
+
+        # Return an error for every second (even-numbered) request
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
+            out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
+            if self.inf_count % 2:
+                # Every odd request is a success
+                responses.append(pb_utils.InferenceResponse([out_tensor]))
+            else:
+                # Every even request is a failure
+                error = pb_utils.TritonError("An error occurred during execution")
+                responses.append(pb_utils.InferenceResponse([out_tensor], error))
+            self.inf_count += 1
+
+        return responses
diff --git a/qa/python_models/execute_return_error/config.pbtxt b/qa/python_models/execute_return_error/config.pbtxt
new file mode 100644
index 0000000000..0a31be6e0d
--- /dev/null
+++ b/qa/python_models/execute_return_error/config.pbtxt
@@ -0,0 +1,52 @@
+# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "execute_return_error"
+backend: "python"
+max_batch_size: 64
+
+input [
+  {
+    name: "INPUT"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  }
+]
+
+output [
+  {
+    name: "OUTPUT"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind : KIND_CPU
+  }
+]
diff --git a/qa/python_models/execute_return_error/model.py b/qa/python_models/execute_return_error/model.py
new file mode 100644
index 0000000000..e304441f04
--- /dev/null
+++ b/qa/python_models/execute_return_error/model.py
@@ -0,0 +1,43 @@
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +class TritonPythonModel: + def initialize(self, args): + self._i = -1 + + def execute(self, requests): + """ + Tests returning invalid responses in execute request. + """ + + self._i += 1 + i = self._i + + if i % 2 == 0: + return None + else: + return [None] * len(requests) diff --git a/qa/python_models/fan_add_sub/config.pbtxt b/qa/python_models/fan_add_sub/config.pbtxt new file mode 100644 index 0000000000..1b2a437488 --- /dev/null +++ b/qa/python_models/fan_add_sub/config.pbtxt @@ -0,0 +1,139 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
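+
+# Ensemble wrapper used by the QA tests. Note: "ENSEMBLE_MODEL_NAME" below is
+# a placeholder that is presumably substituted with a concrete add_sub model
+# name by the test setup scripts before the model is loaded.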
+ +name: "fan_add_sub" +platform: "ensemble" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + + + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + + + } +] +ensemble_scheduling { + step [ + { + model_name: "nop_TYPE_FP32_-1" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + input_map { + key: "INPUT1" + value: "INPUT1" + } + output_map { + key: "OUTPUT0" + value: "same_input0" + } + output_map { + key: "OUTPUT1" + value: "same_input1" + } + }, + { + model_name: "ENSEMBLE_MODEL_NAME" + model_version: -1 + input_map { + key: "INPUT0" + value: "same_input0" + } + input_map { + key: "INPUT1" + value: "same_input1" + } + output_map { + key: "OUTPUT0" + value: "same_output0" + } + output_map { + key: "OUTPUT1" + value: "same_output1" + } + }, + { + model_name: "nop_TYPE_FP32_-1" + model_version: -1 + input_map { + key: "INPUT0" + value: "same_output0" + } + input_map { + key: "INPUT1" + value: "same_output0" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + }, + { + model_name: "nop_TYPE_FP32_-1" + model_version: -1 + input_map { + key: "INPUT0" + value: "same_output1" + } + input_map { + key: "INPUT1" + value: "same_output1" + } + output_map { + key: "OUTPUT1" + value: "OUTPUT1" + } + } + ] +} diff --git a/qa/python_models/fini_error/config.pbtxt b/qa/python_models/fini_error/config.pbtxt new file mode 100644 index 0000000000..182f9b8adf --- /dev/null +++ b/qa/python_models/fini_error/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
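+
+# Test model whose model.py intentionally raises an error in finalize() so the
+# tests can verify that failures during model unload are reported correctly.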
+
+name: "fini_error"
+backend: "python"
+max_batch_size: 64
+
+input [
+  {
+    name: "IN"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  }
+]
+
+output [
+  {
+    name: "OUT"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind : KIND_CPU
+  }
+]
diff --git a/qa/python_models/fini_error/model.py b/qa/python_models/fini_error/model.py
new file mode 100644
index 0000000000..7a9f409aee
--- /dev/null
+++ b/qa/python_models/fini_error/model.py
@@ -0,0 +1,44 @@
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        """
+        The body of this model doesn't matter. The main purpose of this model is
+        to test correct handling of Python errors in the `finalize` function.
+        """
+        responses = []
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
+            out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
+            responses.append(pb_utils.InferenceResponse([out_tensor]))
+        return responses
+
+    def finalize(self):
+        undefined_variable
diff --git a/qa/python_models/ground_truth/config.pbtxt b/qa/python_models/ground_truth/config.pbtxt
new file mode 100644
index 0000000000..2b7a7d19a2
--- /dev/null
+++ b/qa/python_models/ground_truth/config.pbtxt
@@ -0,0 +1,52 @@
+# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ground_truth" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/ground_truth/model.py b/qa/python_models/ground_truth/model.py new file mode 100644 index 0000000000..24a286e300 --- /dev/null +++ b/qa/python_models/ground_truth/model.py @@ -0,0 +1,51 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
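+
+# Test model that sleeps for the number of seconds given in INPUT0 before
+# echoing the value back, giving the tests precise control over latency.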
+ +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Mock Model that uses the input data to determine how long to wait + before returning identity data + """ + assert len(requests) == 1 + delay = 0 + request = requests[0] + responses = [] + + delay_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + delay_as_numpy = delay_tensor.as_numpy() + delay = float(delay_as_numpy[0][0]) + + out_tensor = pb_utils.Tensor("OUTPUT0", delay_as_numpy) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + time.sleep(delay) + return responses diff --git a/qa/python_models/identity_bf16/config.pbtxt b/qa/python_models/identity_bf16/config.pbtxt new file mode 100644 index 0000000000..e4d7df06c1 --- /dev/null +++ b/qa/python_models/identity_bf16/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_BF16 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_BF16 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/identity_bf16/model.py b/qa/python_models/identity_bf16/model.py new file mode 100644 index 0000000000..57756073b9 --- /dev/null +++ b/qa/python_models/identity_bf16/model.py @@ -0,0 +1,88 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import torch +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + # You must parse model_config. JSON string is not parsed here + self.model_config = json.loads(args["model_config"]) + + # Get tensor configurations for testing/validation + self.input0_config = pb_utils.get_input_config_by_name( + self.model_config, "INPUT0" + ) + self.output0_config = pb_utils.get_output_config_by_name( + self.model_config, "OUTPUT0" + ) + + def validate_bf16_tensor(self, tensor, tensor_config): + # I/O datatypes can be queried from the model config if needed + dtype = tensor_config["data_type"] + if dtype != "TYPE_BF16": + raise Exception(f"Expected a BF16 tensor, but got {dtype} instead.") + + # Converting BF16 tensors to numpy is not supported, and DLPack + # should be used instead via to_dlpack and from_dlpack. + try: + _ = tensor.as_numpy() + except pb_utils.TritonModelException as e: + expected_error = "tensor dtype is bf16 and cannot be converted to numpy" + assert expected_error in str(e).lower() + else: + raise Exception("Expected BF16 conversion to numpy to fail") + + def execute(self, requests): + """ + Identity model in Python backend with example BF16 and PyTorch usage. + """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + + # Numpy does not support BF16, so use DLPack instead. + bf16_dlpack = input_tensor.to_dlpack() + + # OPTIONAL: The tensor can be converted to other dlpack-compatible + # frameworks like PyTorch and TensorFlow with their dlpack utilities. + torch_tensor = torch.utils.dlpack.from_dlpack(bf16_dlpack) + + # When complete, convert back to a pb_utils.Tensor via DLPack. + output_tensor = pb_utils.Tensor.from_dlpack( + "OUTPUT0", torch.utils.dlpack.to_dlpack(torch_tensor) + ) + responses.append(pb_utils.InferenceResponse([output_tensor])) + + # NOTE: The following helper function is for testing and example + # purposes only, you should remove this in practice. + self.validate_bf16_tensor(input_tensor, self.input0_config) + self.validate_bf16_tensor(output_tensor, self.output0_config) + + return responses diff --git a/qa/python_models/identity_fp32/config.pbtxt b/qa/python_models/identity_fp32/config.pbtxt new file mode 100644 index 0000000000..6cf6312acb --- /dev/null +++ b/qa/python_models/identity_fp32/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity_fp32" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/identity_fp32/model.py b/qa/python_models/identity_fp32/model.py new file mode 100644 index 0000000000..2161a1e732 --- /dev/null +++ b/qa/python_models/identity_fp32/model.py @@ -0,0 +1,40 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Identity model in Python backend. + """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/identity_fp32_logging/config.pbtxt b/qa/python_models/identity_fp32_logging/config.pbtxt new file mode 100644 index 0000000000..aaa4a2ee43 --- /dev/null +++ b/qa/python_models/identity_fp32_logging/config.pbtxt @@ -0,0 +1,53 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity_fp32_logging" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + diff --git a/qa/python_models/identity_fp32_logging/model.py b/qa/python_models/identity_fp32_logging/model.py new file mode 100644 index 0000000000..91ace61fd5 --- /dev/null +++ b/qa/python_models/identity_fp32_logging/model.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + logger = pb_utils.Logger + logger.log("Initialize-Specific Msg!", logger.INFO) + logger.log_info("Initialize-Info Msg!") + logger.log_warn("Initialize-Warning Msg!") + logger.log_error("Initialize-Error Msg!") + logger.log_verbose("Initialize-Verbose Msg!") + + def execute(self, requests): + """ + Identity model in Python backend. + """ + # Log as early as possible + logger = pb_utils.Logger + logger.log("Execute-Specific Msg!", logger.INFO) + logger.log_info("Execute-Info Msg!") + logger.log_warn("Execute-Warning Msg!") + logger.log_error("Execute-Error Msg!") + logger.log_verbose("Execute-Verbose Msg!") + + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + + # Log as late as possible + logger.log("Execute-Specific Msg!", logger.INFO) + logger.log_info("Execute-Info Msg!") + logger.log_warn("Execute-Warning Msg!") + logger.log_error("Execute-Error Msg!") + logger.log_verbose("Execute-Verbose Msg!") + + return responses + + def finalize(self): + logger = pb_utils.Logger + logger.log("Finalize-Specific Msg!", logger.INFO) + logger.log_info("Finalize-Info Msg!") + logger.log_warn("Finalize-Warning Msg!") + logger.log_error("Finalize-Error Msg!") + logger.log_verbose("Finalize-Verbose Msg!") diff --git a/qa/python_models/identity_fp32_timeout/config.pbtxt b/qa/python_models/identity_fp32_timeout/config.pbtxt new file mode 100644 index 0000000000..c14fd8e0a3 --- /dev/null +++ b/qa/python_models/identity_fp32_timeout/config.pbtxt @@ -0,0 +1,60 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "identity_fp32_timeout" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + +dynamic_batching { + default_queue_policy { + timeout_action: REJECT + allow_timeout_override: true + default_timeout_microseconds: 1000000 + } +} diff --git a/qa/python_models/identity_fp32_timeout/model.py b/qa/python_models/identity_fp32_timeout/model.py new file mode 100644 index 0000000000..356948e8de --- /dev/null +++ b/qa/python_models/identity_fp32_timeout/model.py @@ -0,0 +1,45 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Identity model in Python backend. 
+ """ + logger = pb_utils.Logger + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + logger.log_info(f"Request timeout: {request.timeout()}") + time.sleep(5) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/init_args/config.pbtxt b/qa/python_models/init_args/config.pbtxt new file mode 100644 index 0000000000..4b0ba9e137 --- /dev/null +++ b/qa/python_models/init_args/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "init_args" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/init_args/model.py b/qa/python_models/init_args/model.py new file mode 100644 index 0000000000..0aa9bb2c7a --- /dev/null +++ b/qa/python_models/init_args/model.py @@ -0,0 +1,94 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys + +import numpy as np +import triton_python_backend_utils as pb_utils + + +def check_init_args(args): + expected_args = { + "model_name": "init_args", + "model_instance_name": "init_args_0_0", + "model_instance_kind": "CPU", + "model_instance_device_id": "0", + "model_version": "1", + } + is_win = sys.platform == "win32" + triton_dir = os.getenv( + "TRITON_DIR", "c:\\tritonserver" if is_win else "/opt/tritonserver" + ) + repo_path = triton_dir + "/qa/L0_backend_python/models/init_args" + expected_args["model_repository"] = ( + repo_path.replace("/", "\\") if is_win else repo_path + ) + + for arg in expected_args: + if args[arg] != expected_args[arg]: + raise pb_utils.TritonModelException( + arg + + ' does not contain correct value. Expected "' + + expected_args[arg] + + ", got " + + args[arg] + ) + + +class TritonPythonModel: + def initialize(self, args): + self.args = args + check_init_args(self.args) + + def execute(self, requests): + """ + This function counts the number of keys in the + "initialize" args argument to make sure that they are + correct. + """ + keys = [ + "model_config", + "model_instance_kind", + "model_instance_name", + "model_instance_device_id", + "model_repository", + "model_version", + "model_name", + ] + + correct_keys = 0 + for key in keys: + if key in list(self.args): + correct_keys += 1 + + responses = [] + for _ in requests: + out_args = pb_utils.Tensor( + "OUT", np.array([correct_keys], dtype=np.float32) + ) + responses.append(pb_utils.InferenceResponse([out_args])) + return responses diff --git a/qa/python_models/init_error/config.pbtxt b/qa/python_models/init_error/config.pbtxt new file mode 100644 index 0000000000..1e4457054f --- /dev/null +++ b/qa/python_models/init_error/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "init_error" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/init_error/model.py b/qa/python_models/init_error/model.py new file mode 100644 index 0000000000..654dc8ef2c --- /dev/null +++ b/qa/python_models/init_error/model.py @@ -0,0 +1,46 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = args["model_config"] + lorem_ipsum + + def execute(self, requests): + """ + The main purpose of this function is to check whether undefined + variables are correctly handled in `initialize` function. The body of + this function is never called or used. 
+        """
+        responses = []
+        for request in requests:
+            input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
+            out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
+            responses.append(pb_utils.InferenceResponse([out_tensor]))
+        return responses
diff --git a/qa/python_models/init_exit/config.pbtxt b/qa/python_models/init_exit/config.pbtxt
new file mode 100644
index 0000000000..a18aff189d
--- /dev/null
+++ b/qa/python_models/init_exit/config.pbtxt
@@ -0,0 +1,46 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import signal +import sys +import time + + +class TritonPythonModel: + def initialize(self, args): + time.sleep(3) + # Simulate the case that the model goes out of memory and gets killed + # by the OOM killer + # NOTE: Windows runners use python 3.8 which do not have access to SIGKILL. + # We should remove this condition check when we upgrade the version of python. + # Online forums suggest 'CTRL_C_EVENT' should be the equivalent event, however, + # using this signal terminates the entire test, not just the server. SIGINT + # seems to work in the meantime. + if sys.platform == "win32": + os.kill(os.getpid(), signal.SIGINT) + else: + os.kill(os.getpid(), signal.SIGKILL) + + def execute(self, requests): + pass diff --git a/qa/python_models/iterative_sequence/config.pbtxt b/qa/python_models/iterative_sequence/config.pbtxt new file mode 100644 index 0000000000..faa1735718 --- /dev/null +++ b/qa/python_models/iterative_sequence/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
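+
+# Iterative-sequence test model. Decoupled transactions and iterative_sequence
+# batching are required so that each request can be rescheduled until the
+# countdown implemented in model.py reaches zero.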
+ +name: "iterative_sequence" +backend: "python" +max_batch_size: 0 +model_transaction_policy { + decoupled: True +} +input [ + { + name: "IN" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +output [ + { + name: "OUT" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] +sequence_batching { + iterative_sequence : true +} + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/iterative_sequence/model.py b/qa/python_models/iterative_sequence/model.py new file mode 100644 index 0000000000..c45f82a607 --- /dev/null +++ b/qa/python_models/iterative_sequence/model.py @@ -0,0 +1,131 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """ + This model takes 1 input tensor, an INT32 [ 1 ] input named "IN", and + produces an output tensor "OUT" with the same shape as the input tensor. + The input value indicates the total number of responses to be generated and + the output value indicates the number of remaining responses. For example, + if the request input has value 2, the model will: + - Send a response with value 1. + - Release request with RESCHEDULE flag. + - When execute on the same request, send the last response with value 0. + - Release request with ALL flag. 
+ """ + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + using_decoupled = pb_utils.using_decoupled_model_transaction_policy( + model_config + ) + if not using_decoupled: + raise pb_utils.TritonModelException( + """the model `{}` can generate any number of responses per request, + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) + + # Get IN configuration + in_config = pb_utils.get_input_config_by_name(model_config, "IN") + + # Validate the shape and data type of IN + in_shape = in_config["dims"] + if (len(in_shape) != 1) or (in_shape[0] != 1): + raise pb_utils.TritonModelException( + """the model `{}` requires the shape of 'IN' to be + [1], got {}""".format( + args["model_name"], in_shape + ) + ) + if in_config["data_type"] != "TYPE_INT32": + raise pb_utils.TritonModelException( + """the model `{}` requires the data_type of 'IN' to be + 'TYPE_INT32', got {}""".format( + args["model_name"], in_config["data_type"] + ) + ) + + # Get OUT configuration + out_config = pb_utils.get_output_config_by_name(model_config, "OUT") + + # Validate the shape and data type of OUT + out_shape = out_config["dims"] + if (len(out_shape) != 1) or (out_shape[0] != 1): + raise pb_utils.TritonModelException( + """the model `{}` requires the shape of 'OUT' to be + [1], got {}""".format( + args["model_name"], out_shape + ) + ) + if out_config["data_type"] != "TYPE_INT32": + raise pb_utils.TritonModelException( + """the model `{}` requires the data_type of 'OUT' to be + 'TYPE_INT32', got {}""".format( + args["model_name"], out_config["data_type"] + ) + ) + + self.remaining_response = 0 + self.reset_flag = True + + def execute(self, requests): + for request in requests: + in_input = pb_utils.get_input_tensor_by_name(request, "IN").as_numpy() + + if self.reset_flag: + self.remaining_response = in_input[0] + self.reset_flag = False + + response_sender = request.get_response_sender() + + self.remaining_response -= 1 + + out_output = pb_utils.Tensor( + "OUT", np.array([self.remaining_response], np.int32) + ) + response = pb_utils.InferenceResponse(output_tensors=[out_output]) + + if self.remaining_response <= 0: + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + request.set_release_flags( + pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE + ) + response_sender.send(response) + + return None diff --git a/qa/python_models/model_env/config.pbtxt b/qa/python_models/model_env/config.pbtxt new file mode 100644 index 0000000000..ca48cc00a2 --- /dev/null +++ b/qa/python_models/model_env/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
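The docstring of the iterative_sequence model above describes a countdown protocol driven by request rescheduling. A plain-Python stand-in for that loop (no Triton objects; countdown_responses is an illustrative helper, not an API):

# Each loop iteration stands for one execute() pass over the same request.
def countdown_responses(total):
    remaining = total
    responses = []
    while True:
        remaining -= 1
        responses.append(remaining)   # one response per (re)execution
        if remaining <= 0:
            return responses          # final pass: request released with the ALL flag
        # otherwise the request is released with the RESCHEDULE flag and
        # comes back to execute() on a later scheduler pass

assert countdown_responses(2) == [1, 0]   # matches the docstring example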
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "model_env" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/model_env/model.py b/qa/python_models/model_env/model.py new file mode 100644 index 0000000000..8cc9db8d81 --- /dev/null +++ b/qa/python_models/model_env/model.py @@ -0,0 +1,42 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + # Make sure that environment variables are correctly propagated + # to the Python models + if "MY_ENV" not in os.environ or os.environ["MY_ENV"] != "MY_ENV": + raise pb_utils.TritonModelException( + "MY_ENV doesn't exists or contains incorrect value" + ) + + def execute(self, requests): + pass diff --git a/qa/python_models/model_init_del/config.pbtxt b/qa/python_models/model_init_del/config.pbtxt new file mode 100644 index 0000000000..be66468a0a --- /dev/null +++ b/qa/python_models/model_init_del/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
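The model_env model above only verifies that MY_ENV reached the Python stub, so the variable has to be set before the server starts. A hedged sketch of that setup (the repository path is a placeholder):

import os
import subprocess

env = dict(os.environ, MY_ENV="MY_ENV")   # value the model expects
server = subprocess.Popen(
    ["tritonserver", "--model-repository", "/path/to/model_repository"],
    env=env,
)
# ... run an inference against "model_env", then shut the server down ...
server.terminate()
server.wait()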
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "model_init_del" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] # end instance_group diff --git a/qa/python_models/model_init_del/model.py b/qa/python_models/model_init_del/model.py new file mode 100644 index 0000000000..578279f8ef --- /dev/null +++ b/qa/python_models/model_init_del/model.py @@ -0,0 +1,57 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import sys +import time + +import triton_python_backend_utils as pb_utils + +sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) +from util import get_delay, inc_count + + +class TritonPythonModel: + def initialize(self, args): + inc_count("initialize") + self._sleep("initialize") + + def execute(self, requests): + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + self._sleep("infer") + return responses + + def finalize(self): + inc_count("finalize") + + def _sleep(self, kind): + delay = get_delay(kind) + if delay > 0: + time.sleep(delay) diff --git a/qa/python_models/model_init_del/util.py b/qa/python_models/model_init_del/util.py new file mode 100755 index 0000000000..a36f13eea9 --- /dev/null +++ b/qa/python_models/model_init_del/util.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import fcntl +import os + +_model_name = "model_init_del" + +# +# Helper functions for reading/writing state to disk +# + + +def _get_number(filename): + full_path = os.path.join(os.environ["MODEL_LOG_DIR"], filename) + try: + with open(full_path, mode="r", encoding="utf-8", errors="strict") as f: + fcntl.lockf(f, fcntl.LOCK_SH) + txt = f.read() + except FileNotFoundError: + txt = "0" + return int(txt) + + +def _store_number(filename, number): + full_path = os.path.join(os.environ["MODEL_LOG_DIR"], filename) + txt = str(number) + with open(full_path, mode="w", encoding="utf-8", errors="strict") as f: + fcntl.lockf(f, fcntl.LOCK_EX) + f.write(txt) + + +def _inc_number(filename): + full_path = os.path.join(os.environ["MODEL_LOG_DIR"], filename) + try: + with open(full_path, mode="r+", encoding="utf-8", errors="strict") as f: + fcntl.lockf(f, fcntl.LOCK_EX) + txt = f.read() + number = int(txt) + 1 + txt = str(number) + f.truncate(0) + f.seek(0) + f.write(txt) + except FileNotFoundError: + number = 1 + _store_number(filename, number) + return number + + +# +# Functions for communicating initialize and finalize count between the model +# and test +# + + +def _get_count_filename(kind): + if kind != "initialize" and kind != "finalize": + raise KeyError("Invalid count kind: " + str(kind)) + filename = _model_name + "_" + kind + "_count.txt" + return filename + + +def get_count(kind): + return _get_number(_get_count_filename(kind)) + + +def inc_count(kind): + return _inc_number(_get_count_filename(kind)) + + +def reset_count(kind): + count = 0 + _store_number(_get_count_filename(kind), count) + return count + + +# +# Functions for communicating varies of delay (in seconds) to the model +# + + +def _get_delay_filename(kind): + if kind != "initialize" and kind != "infer": + raise KeyError("Invalid delay kind: " + str(kind)) + filename = _model_name + "_" + kind + "_delay.txt" + return filename + + +def get_delay(kind): + return _get_number(_get_delay_filename(kind)) + + +def set_delay(kind, delay): + _store_number(_get_delay_filename(kind), delay) + return delay + + +# +# Functions for modifying the model +# + + +def update_instance_group(instance_group_str): + full_path = os.path.join(os.path.dirname(__file__), "config.pbtxt") + with open(full_path, mode="r+", encoding="utf-8", errors="strict") as f: + txt = f.read() + txt, post_match = txt.split("instance_group [") + txt += "instance_group [\n" + txt += instance_group_str + txt += "\n] # end instance_group\n" + txt += post_match.split("\n] # end instance_group\n")[1] + f.truncate(0) + f.seek(0) + f.write(txt) + return txt + + +def update_sequence_batching(sequence_batching_str): + full_path = os.path.join(os.path.dirname(__file__), "config.pbtxt") + with open(full_path, mode="r+", encoding="utf-8", errors="strict") as f: + txt = f.read() + if "sequence_batching {" in txt: + txt, post_match = txt.split("sequence_batching {") + if sequence_batching_str != "": + txt += "sequence_batching {\n" + txt += sequence_batching_str + txt += "\n} # end sequence_batching\n" + txt += post_match.split("\n} # end sequence_batching\n")[1] + elif sequence_batching_str != "": + txt += "\nsequence_batching {\n" + txt += sequence_batching_str + txt += "\n} # end sequence_batching\n" + f.truncate(0) + f.seek(0) + f.write(txt) + return txt + + +def update_model_file(): + full_path = os.path.join(os.path.dirname(__file__), "1", "model.py") + with open(full_path, mode="a", encoding="utf-8", errors="strict") as f: + f.write("\n# dummy model file update\n") + + +def 
enable_batching(): + full_path = os.path.join(os.path.dirname(__file__), "config.pbtxt") + with open(full_path, mode="r+", encoding="utf-8", errors="strict") as f: + txt = f.read() + txt = txt.replace("max_batch_size: 0", "max_batch_size: 2") + f.truncate(0) + f.seek(0) + f.write(txt) + return txt + + +def disable_batching(): + full_path = os.path.join(os.path.dirname(__file__), "config.pbtxt") + with open(full_path, mode="r+", encoding="utf-8", errors="strict") as f: + txt = f.read() + txt = txt.replace("max_batch_size: 2", "max_batch_size: 0") + f.truncate(0) + f.seek(0) + f.write(txt) + return txt diff --git a/qa/python_models/multi_file/file1.py b/qa/python_models/multi_file/file1.py new file mode 100755 index 0000000000..46b6d76934 --- /dev/null +++ b/qa/python_models/multi_file/file1.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FILE_NAME = "FILE1" diff --git a/qa/python_models/multi_file/file2.py b/qa/python_models/multi_file/file2.py new file mode 100755 index 0000000000..b7174da748 --- /dev/null +++ b/qa/python_models/multi_file/file2.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
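The util helpers above exchange counts and delays with the test through files under MODEL_LOG_DIR. A small usage sketch, assuming util.py is importable and with a placeholder log directory:

import os

os.environ["MODEL_LOG_DIR"] = "/tmp/model_init_del_logs"   # placeholder path
os.makedirs(os.environ["MODEL_LOG_DIR"], exist_ok=True)

from util import get_count, reset_count, set_delay

reset_count("initialize")
set_delay("infer", 2)            # each execute() will sleep 2 seconds
# ... load the model_init_del model and run an inference here ...
print(get_count("initialize"))   # counter read back from the shared file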
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +FILE_NAME = "FILE2" diff --git a/qa/python_models/multi_file/model.py b/qa/python_models/multi_file/model.py new file mode 100644 index 0000000000..b94d6f336f --- /dev/null +++ b/qa/python_models/multi_file/model.py @@ -0,0 +1,39 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import file1 +import triton_python_backend_utils as pb_utils + +from . import file2 + + +class TritonPythonModel: + def initialize(self, args): + if file1.FILE_NAME != "FILE1" or file2.FILE_NAME != "FILE2": + raise pb_utils.TritonModelException("Imports do not work") + + def execute(self, requests): + pass diff --git a/qa/python_models/non_contiguous/config.pbtxt b/qa/python_models/non_contiguous/config.pbtxt new file mode 100644 index 0000000000..530c8cbf99 --- /dev/null +++ b/qa/python_models/non_contiguous/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "non_contiguous" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1, -1, -1, -1, -1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1, -1, -1, -1, -1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ -1, -1, -1, -1, -1 ] + }, + { + name: "OUTPUT2" + data_type: TYPE_FP32 + dims: [ -1, -1, -1, -1, -1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/non_contiguous/model.py b/qa/python_models/non_contiguous/model.py new file mode 100644 index 0000000000..de7417303b --- /dev/null +++ b/qa/python_models/non_contiguous/model.py @@ -0,0 +1,46 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + responses = [] + new_shape = [10, 2, 6, 5, 11] + shape_reorder = [1, 0, 4, 2, 3] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + input_numpy = input_tensor.as_numpy() + output0 = pb_utils.Tensor("OUTPUT0", input_numpy.reshape(new_shape)) + # Transpose the tensor to create a non-contiguous tensor. + output1 = pb_utils.Tensor("OUTPUT1", input_numpy.T) + output2 = pb_utils.Tensor( + "OUTPUT2", np.transpose(input_numpy, shape_reorder) + ) + responses.append(pb_utils.InferenceResponse([output0, output1, output2])) + return responses diff --git a/qa/python_models/optional/config.pbtxt b/qa/python_models/optional/config.pbtxt new file mode 100644 index 0000000000..c681ec807f --- /dev/null +++ b/qa/python_models/optional/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "optional" +backend: "python" +max_batch_size: 0 +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "INPUT1" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "OUTPUT1" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] diff --git a/qa/python_models/optional/model.py b/qa/python_models/optional/model.py new file mode 100644 index 0000000000..f0a790b43a --- /dev/null +++ b/qa/python_models/optional/model.py @@ -0,0 +1,56 @@ +# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
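The non_contiguous model above depends on NumPy transposes and axis permutations producing views that are not C-contiguous; a quick standalone check with the same shapes:

import numpy as np

a = np.arange(10 * 2 * 6 * 5 * 11, dtype=np.float32).reshape(2, 10, 5, 6, 11)
reshaped = a.reshape(10, 2, 6, 5, 11)          # plain reshape of a contiguous array
transposed = a.T                               # reversed axes -> strided view
reordered = np.transpose(a, (1, 0, 4, 2, 3))   # same permutation the model uses

print(reshaped.flags["C_CONTIGUOUS"])    # True
print(transposed.flags["C_CONTIGUOUS"])  # False
print(reordered.flags["C_CONTIGUOUS"])   # False
print(np.ascontiguousarray(reordered).flags["C_CONTIGUOUS"])  # True (makes a copy)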
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """Model supporting optional inputs. If the input is not provided, an + input tensor of size 1 containing scalar 5 will be used.""" + responses = [] + for request in requests: + input0_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + input1_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + if input0_tensor is not None: + input0_numpy = input0_tensor.as_numpy() + else: + input0_numpy = np.array([5], dtype=np.int32) + + if input1_tensor is not None: + input1_numpy = input1_tensor.as_numpy() + else: + input1_numpy = np.array([5], dtype=np.int32) + + output0_tensor = pb_utils.Tensor("OUTPUT0", input0_numpy + input1_numpy) + output1_tensor = pb_utils.Tensor("OUTPUT1", input0_numpy - input1_numpy) + responses.append( + pb_utils.InferenceResponse([output0_tensor, output1_tensor]) + ) + + return responses diff --git a/qa/python_models/python_based_backends/add_sub_backend/model.py b/qa/python_models/python_based_backends/add_sub_backend/model.py new file mode 100644 index 0000000000..7c9736b2d5 --- /dev/null +++ b/qa/python_models/python_based_backends/add_sub_backend/model.py @@ -0,0 +1,162 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
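A worked example of the fallback behavior in the optional model above, using plain NumPy (a missing input is replaced by the scalar 5):

import numpy as np

def optional_add_sub(input0=None, input1=None):
    in0 = input0 if input0 is not None else np.array([5], dtype=np.int32)
    in1 = input1 if input1 is not None else np.array([5], dtype=np.int32)
    return in0 + in1, in0 - in1

out0, out1 = optional_add_sub(input0=np.array([3], dtype=np.int32))
assert out0[0] == 8 and out1[0] == -2   # INPUT1 defaulted to 5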
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import os + +import triton_python_backend_utils as pb_utils + +_ADD_SUB_ARGS_FILENAME = "model.json" + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + """This function is called only once when loading the model assuming + the server was not started with `--disable-auto-complete-config`. + + Parameters + ---------- + auto_complete_model_config : pb_utils.ModelConfig + An object containing the existing model configuration. + + Returns + ------- + pb_utils.ModelConfig + An object containing the auto-completed model configuration + """ + inputs = [ + {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]}, + {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]}, + ] + outputs = [{"name": "OUTPUT", "data_type": "TYPE_FP32", "dims": [4]}] + + config = auto_complete_model_config.as_dict() + input_names = [] + output_names = [] + + for input in config["input"]: + input_names.append(input["name"]) + + for output in config["output"]: + output_names.append(output["name"]) + + for input in inputs: + if input["name"] not in input_names: + auto_complete_model_config.add_input(input) + + for output in outputs: + if output["name"] not in output_names: + auto_complete_model_config.add_output(output) + + return auto_complete_model_config + + def initialize(self, args): + """This function allows the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + + self.model_config = model_config = json.loads(args["model_config"]) + + # Get OUTPUT configuration + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + + engine_args_filepath = os.path.join( + pb_utils.get_model_dir(), _ADD_SUB_ARGS_FILENAME + ) + assert os.path.isfile( + engine_args_filepath + ), f"'{_ADD_SUB_ARGS_FILENAME}' containing add sub model args must be provided in '{pb_utils.get_model_dir()}'" + + with open(engine_args_filepath) as file: + self.add_sub_config = json.load(file) + + assert ( + "operation" in self.add_sub_config + ), f"Missing required key 'operation' in {_ADD_SUB_ARGS_FILENAME}" + + extra_keys = set(self.add_sub_config.keys()) - {"operation"} + assert ( + not extra_keys + ), f"Unsupported keys are provided in {_ADD_SUB_ARGS_FILENAME}: {', '.join(extra_keys)}" + + assert self.add_sub_config["operation"] in [ + "add", + "sub", + ], f"'operation' value must be 'add' or 'sub' in {_ADD_SUB_ARGS_FILENAME}" + + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + def execute(self, requests): + """This function is called when an inference request is made + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + if self.add_sub_config["operation"] == "add": + out = in_0.as_numpy() + in_1.as_numpy() + else: + out = in_0.as_numpy() - in_1.as_numpy() + + # Create output tensors. + out_tensor = pb_utils.Tensor("OUTPUT", out.astype(self.output_dtype)) + + # Create InferenceResponse. + inference_response = pb_utils.InferenceResponse(output_tensors=[out_tensor]) + responses.append(inference_response) + + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded.""" + print("Cleaning up...") diff --git a/qa/python_models/python_version/config.pbtxt b/qa/python_models/python_version/config.pbtxt new file mode 100644 index 0000000000..af520e0771 --- /dev/null +++ b/qa/python_models/python_version/config.pbtxt @@ -0,0 +1,29 @@ +# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
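The add_sub python-based backend above reads its operation from a model.json placed where pb_utils.get_model_dir() resolves. A sketch of writing that file for a model served by this backend (the directory path is a placeholder, and the exact location is left to the test setup):

import json
import os

args_path = os.path.join("/path/to/add_sub_model_dir", "model.json")  # placeholder
os.makedirs(os.path.dirname(args_path), exist_ok=True)
with open(args_path, "w") as f:
    json.dump({"operation": "add"}, f)   # only "add" and "sub" pass the asserts above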
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "python_version" + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/python_version/model.py b/qa/python_models/python_version/model.py new file mode 100644 index 0000000000..5d77906fa9 --- /dev/null +++ b/qa/python_models/python_version/model.py @@ -0,0 +1,69 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import locale +import os +import sys + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + input = {"name": "INPUT", "data_type": "TYPE_FP32", "dims": [1]} + output = {"name": "OUTPUT", "data_type": "TYPE_FP32", "dims": [1]} + + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.add_input(input) + auto_complete_model_config.add_output(output) + + return auto_complete_model_config + + def initialize(self, args): + import tensorflow + + self.model_config = args["model_config"] + # This is to make sure that /bin/bash is not picking up + # the wrong shared libraries after installing Tensorflow. + # Tensorflow uses a shared library which is common with + # bash. 
+ os.system("/bin/bash --help") + print( + f"Python version is {sys.version_info.major}.{sys.version_info.minor}, NumPy version is {np.version.version}, and Tensorflow version is {tensorflow.__version__}", + flush=True, + ) + print(f"Locale is {locale.getlocale()}", flush=True) + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/pytorch_fp32_fp32/config.pbtxt b/qa/python_models/pytorch_fp32_fp32/config.pbtxt new file mode 100644 index 0000000000..75e625bf80 --- /dev/null +++ b/qa/python_models/pytorch_fp32_fp32/config.pbtxt @@ -0,0 +1,51 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "pytorch_model" +backend: "python" + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ 1, 1, 28, 28 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ 1, 10 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/pytorch_fp32_fp32/model.py b/qa/python_models/pytorch_fp32_fp32/model.py new file mode 100644 index 0000000000..98269213b2 --- /dev/null +++ b/qa/python_models/pytorch_fp32_fp32/model.py @@ -0,0 +1,77 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton_python_backend_utils as pb_utils + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout2d(0.25) + self.dropout2 = nn.Dropout2d(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +class TritonPythonModel: + def initialize(self, args): + torch.manual_seed(0) + self.model = Net() + self.model.eval() + + def execute(self, requests): + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + # This tensor is read-only, we need to make a copy + input_data_ro = input_tensor.as_numpy() + input_data = np.array(input_data_ro) + result = self.model(torch.tensor(input_data)) + + out_tensor = pb_utils.Tensor("OUT", result.detach().numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/request_rescheduling_addsub/config.pbtxt b/qa/python_models/request_rescheduling_addsub/config.pbtxt new file mode 100644 index 0000000000..7667bfb3c0 --- /dev/null +++ b/qa/python_models/request_rescheduling_addsub/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
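A quick standalone check of the Net defined in the pytorch_fp32_fp32 model above (requires torch and assumes the Net class is in scope): one 1x28x28 image yields a [1, 10] log-probability vector, matching the config's output dims.

import torch

net = Net()
net.eval()                      # dropout layers become no-ops
with torch.no_grad():
    logp = net(torch.zeros(1, 1, 28, 28))
assert logp.shape == (1, 10)
assert torch.allclose(logp.exp().sum(dim=1), torch.ones(1))   # log_softmax rows sum to 1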
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "request_rescheduling_addsub" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +sequence_batching { + iterative_sequence : true +} +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/request_rescheduling_addsub/model.py b/qa/python_models/request_rescheduling_addsub/model.py new file mode 100644 index 0000000000..fb7b0ac9c7 --- /dev/null +++ b/qa/python_models/request_rescheduling_addsub/model.py @@ -0,0 +1,82 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + self.idx = 0 + + def execute(self, requests): + """This function is called on inference request.""" + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0, out_tensor_1] + ) + + # Explicitly reschedule the first request + if self.idx == 0: + request.set_release_flags( + pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE + ) + responses.append(None) + self.idx += 1 + else: + responses.append(inference_response) + + return responses diff --git a/qa/python_models/response_sender/config.pbtxt b/qa/python_models/response_sender/config.pbtxt new file mode 100644 index 0000000000..ef0c29e3bf --- /dev/null +++ b/qa/python_models/response_sender/config.pbtxt @@ -0,0 +1,65 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
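In the request_rescheduling_addsub model above, only the very first request is released with the RESCHEDULE flag (its response slot is None), so that request runs through execute() twice before a real response is produced. A plain-Python sketch of that bookkeeping (no Triton objects; names are illustrative):

def simulate_executions():
    idx = 0
    delivered = []
    pending = ["request_0"]            # the scheduler re-queues rescheduled requests
    while pending:
        request = pending.pop(0)
        if idx == 0:
            idx += 1
            pending.append(request)    # released with RESCHEDULE -> executed again
            delivered.append(None)     # no response on the first pass
        else:
            delivered.append("add/sub outputs for " + request)
    return delivered

print(simulate_executions())   # [None, 'add/sub outputs for request_0']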
+ +backend: "python" +max_batch_size: 8 + +input [ + { + name: "NUMBER_OF_RESPONSE_BEFORE_RETURN" + data_type: TYPE_UINT8 + dims: [ 1 ] + }, + { + name: "SEND_COMPLETE_FINAL_FLAG_BEFORE_RETURN" + data_type: TYPE_BOOL + dims: [ 1 ] + }, + { + name: "RETURN_A_RESPONSE" + data_type: TYPE_BOOL + dims: [ 1 ] + }, + { + name: "NUMBER_OF_RESPONSE_AFTER_RETURN" + data_type: TYPE_UINT8 + dims: [ 1 ] + }, + { + name: "SEND_COMPLETE_FINAL_FLAG_AFTER_RETURN" + data_type: TYPE_BOOL + dims: [ 1 ] + } +] +output [ + { + name: "INDEX" + data_type: TYPE_UINT16 + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/response_sender/model.py b/qa/python_models/response_sender/model.py new file mode 100644 index 0000000000..8749b83ee8 --- /dev/null +++ b/qa/python_models/response_sender/model.py @@ -0,0 +1,37 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import triton_python_backend_utils as pb_utils +from model_common import ResponseSenderModelCommon + + +class TritonPythonModel: + def initialize(self, args): + self._common = ResponseSenderModelCommon(pb_utils) + + def execute(self, requests): + return self._common.execute(requests, use_async=False) diff --git a/qa/python_models/response_sender/model_async.py b/qa/python_models/response_sender/model_async.py new file mode 100644 index 0000000000..b12eccef06 --- /dev/null +++ b/qa/python_models/response_sender/model_async.py @@ -0,0 +1,37 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
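The response_sender config above exposes five control tensors that drive how the model uses its response sender. One request's control values as plain NumPy arrays (batch size 1, matching the declared dtypes); with these values the model sends two responses inside execute(), returns nothing directly, then sends one more response and the final flag after returning:

import numpy as np

controls = {
    "NUMBER_OF_RESPONSE_BEFORE_RETURN": np.array([[2]], dtype=np.uint8),
    "SEND_COMPLETE_FINAL_FLAG_BEFORE_RETURN": np.array([[False]], dtype=bool),
    "RETURN_A_RESPONSE": np.array([[False]], dtype=bool),
    "NUMBER_OF_RESPONSE_AFTER_RETURN": np.array([[1]], dtype=np.uint8),
    "SEND_COMPLETE_FINAL_FLAG_AFTER_RETURN": np.array([[True]], dtype=bool),
}
total = int(controls["NUMBER_OF_RESPONSE_BEFORE_RETURN"].sum()
            + controls["NUMBER_OF_RESPONSE_AFTER_RETURN"].sum())
assert total == 3   # responses the client should expect for this request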
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import triton_python_backend_utils as pb_utils +from model_common import ResponseSenderModelCommon + + +class TritonPythonModel: + def initialize(self, args): + self._common = ResponseSenderModelCommon(pb_utils) + + async def execute(self, requests): + return self._common.execute(requests, use_async=True) diff --git a/qa/python_models/response_sender/model_common.py b/qa/python_models/response_sender/model_common.py new file mode 100644 index 0000000000..0e676e0d82 --- /dev/null +++ b/qa/python_models/response_sender/model_common.py @@ -0,0 +1,210 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import asyncio +import threading +import time + +import numpy as np + + +class ResponseSenderModelCommon: + def __init__(self, pb_utils): + self._pb_utils = pb_utils + self._background_tasks = set() + + def _get_instructions_from_request(self, request): + """ + Determine the execution instructions from the inputs. This test tries to examine + all the corner cases with using response sender. + + Assumptions: The request batch size can be larger than one. 
+ + There are 5 inputs in the model that control the model behavior: + * NUMBER_OF_RESPONSE_BEFORE_RETURN (UINT8): + Determines the number of responses before returning from execute function. + * SEND_COMPLETE_FINAL_FLAG_BEFORE_RETURN (BOOL): + Determines whether the final flag will be sent before return. + * RETURN_A_RESPONSE (BOOL): + Return the response when the model is returning from `execute` function. + * NUMBER_OF_RESPONSE_AFTER_RETURN (UINT8): + Determines the number of responses after return. + * SEND_COMPLETE_FINAL_FLAG_AFTER_RETURN (BOOL): + Determines whether the final flag will be sent after return. + + Note: + * If the batch size of a request is larger than one, the sum of the values in + the batch will be used for determining the value of each input of the + request. + * The response_id is used to determine the difference between responses sent + during execute, when execute returns, or after execute returns. + """ + instr = {} + return_a_response_np = self._pb_utils.get_input_tensor_by_name( + request, "RETURN_A_RESPONSE" + ).as_numpy() + instr["batch_size"] = return_a_response_np.shape[0] + instr["return_a_response"] = bool(return_a_response_np.sum()) + instr["number_of_pre_return_response"] = ( + self._pb_utils.get_input_tensor_by_name( + request, "NUMBER_OF_RESPONSE_BEFORE_RETURN" + ) + .as_numpy() + .sum() + ) + instr["number_of_post_return_response"] = ( + self._pb_utils.get_input_tensor_by_name( + request, "NUMBER_OF_RESPONSE_AFTER_RETURN" + ) + .as_numpy() + .sum() + ) + instr["send_complete_final_flag_pre_return"] = bool( + self._pb_utils.get_input_tensor_by_name( + request, "SEND_COMPLETE_FINAL_FLAG_BEFORE_RETURN" + ) + .as_numpy() + .sum() + ) + instr["send_complete_final_flag_post_return"] = bool( + self._pb_utils.get_input_tensor_by_name( + request, "SEND_COMPLETE_FINAL_FLAG_AFTER_RETURN" + ) + .as_numpy() + .sum() + ) + return instr + + def _is_response_sender_needed(self, instr): + return ( + instr["number_of_pre_return_response"] > 0 + or instr["number_of_post_return_response"] > 0 + or instr["send_complete_final_flag_pre_return"] + or instr["send_complete_final_flag_post_return"] + ) + + def _create_response(self, batch_size, response_id): + output_tensor = self._pb_utils.Tensor( + "INDEX", np.array([[response_id] for _ in range(batch_size)], np.uint16) + ) + response = self._pb_utils.InferenceResponse(output_tensors=[output_tensor]) + return response + + def _send_responses(self, processed_requests, response_id_offset): + for request in processed_requests: + number_of_response = request["number_of_response"] + batch_size = request["batch_size"] + response_sender = request["response_sender"] + send_complete_final_flag = request["send_complete_final_flag"] + for response_id in range(number_of_response): + response_sender.send( + self._create_response( + batch_size, response_id=(response_id_offset + response_id) + ) + ) + if send_complete_final_flag: + response_sender.send( + flags=self._pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + def _send_responses_delayed_threaded(self, processed_requests, response_id_offset): + def response_thread(send_responses, processed_requests, response_id_offset): + time.sleep(0.5) # response after requests are released + send_responses(processed_requests, response_id_offset) + + thread = threading.Thread( + target=response_thread, + args=(self._send_responses, processed_requests, response_id_offset), + ) + thread.daemon = True + thread.start() + + def _send_responses_delayed_async(self, processed_requests, 
response_id_offset): + async def response_async( + send_responses, processed_requests, response_id_offset + ): + await asyncio.sleep(0.5) # response after requests are released + send_responses(processed_requests, response_id_offset) + + coro = response_async( + self._send_responses, processed_requests, response_id_offset + ) + task = asyncio.create_task(coro) + self._background_tasks.add(task) + task.add_done_callback(self._background_tasks.discard) + + def execute(self, requests, use_async): + pre_return_processed_requests = [] + return_responses = [] + post_return_processed_requests = [] + + for request in requests: + instr = self._get_instructions_from_request(request) + + response_sender = None + if self._is_response_sender_needed(instr): + response_sender = request.get_response_sender() + + pre_return_processed_requests.append( + { + "number_of_response": instr["number_of_pre_return_response"], + "batch_size": instr["batch_size"], + "response_sender": response_sender, + "send_complete_final_flag": instr[ + "send_complete_final_flag_pre_return" + ], + } + ) + post_return_processed_requests.append( + { + "number_of_response": instr["number_of_post_return_response"], + "batch_size": instr["batch_size"], + "response_sender": response_sender, + "send_complete_final_flag": instr[ + "send_complete_final_flag_post_return" + ], + } + ) + + response = None + if instr["return_a_response"]: + response = self._create_response(instr["batch_size"], response_id=0) + return_responses.append(response) + + self._send_responses(pre_return_processed_requests, response_id_offset=1000) + + if use_async: + self._send_responses_delayed_async( + post_return_processed_requests, response_id_offset=2000 + ) + else: + self._send_responses_delayed_threaded( + post_return_processed_requests, response_id_offset=2000 + ) + + if return_responses == [None for _ in requests]: + return None + return return_responses diff --git a/qa/python_models/response_sender_complete_final/config.pbtxt b/qa/python_models/response_sender_complete_final/config.pbtxt new file mode 100644 index 0000000000..f08ed6da5b --- /dev/null +++ b/qa/python_models/response_sender_complete_final/config.pbtxt @@ -0,0 +1,47 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+backend: "python"
+max_batch_size: 8
+
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  }
+]
+
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  }
+]
+
+instance_group [{ kind: KIND_CPU }]
+model_transaction_policy { decoupled: True }
diff --git a/qa/python_models/response_sender_complete_final/model.py b/qa/python_models/response_sender_complete_final/model.py
new file mode 100644
index 0000000000..e17f0b04f6
--- /dev/null
+++ b/qa/python_models/response_sender_complete_final/model.py
@@ -0,0 +1,63 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        # Expect exactly one request per execute() call.
+        if len(requests) != 1:
+            pb_utils.Logger.log_error(f"Unexpected request length: {len(requests)}")
+            raise Exception("Test FAILED")
+
+        # Send a response with the complete final flag, then send another response
+        # and assert an exception is raised, for all requests.
+ for request in requests: + in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", in_tensor.as_numpy()) + response = pb_utils.InferenceResponse([out_tensor]) + response_sender = request.get_response_sender() + response_sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + test_passed = False + try: + response_sender.send(response) + except Exception as e: + pb_utils.Logger.log_info(f"Raised exception: {e}") + if ( + str(e) + == "Unable to send response. Response sender has been closed." + ): + test_passed = True + finally: + if not test_passed: + pb_utils.Logger.log_error("Expected exception not raised") + raise Exception("Test FAILED") + pb_utils.Logger.log_info("Test Passed") + return None diff --git a/qa/python_models/response_sender_error/config.pbtxt b/qa/python_models/response_sender_error/config.pbtxt new file mode 100644 index 0000000000..168839c241 --- /dev/null +++ b/qa/python_models/response_sender_error/config.pbtxt @@ -0,0 +1,59 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "response_sender_error" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +input [ + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/response_sender_error/model.py b/qa/python_models/response_sender_error/model.py new file mode 100644 index 0000000000..4f1e0e5e85 --- /dev/null +++ b/qa/python_models/response_sender_error/model.py @@ -0,0 +1,75 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model tries to create a response sender in + a model that is not configured with decoupled + model transaction policy. + """ + + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """Tries to create a response sender object and use that + for sending the response. + """ + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + for request in requests: + response_sender = request.get_response_sender() + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + response_sender.send( + pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]) + ) + response_sender.close() + + return None diff --git a/qa/python_models/sequence_int32/config.pbtxt b/qa/python_models/sequence_int32/config.pbtxt new file mode 100644 index 0000000000..fb9236b347 --- /dev/null +++ b/qa/python_models/sequence_int32/config.pbtxt @@ -0,0 +1,80 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "python_nobatch_sequence_int32" +backend: "python" +max_batch_size: 0 +version_policy: { latest { num_versions: 1 }} + + +instance_group [ + { + kind: KIND_GPU +count: 4 + } +] + + +input [ + { + name: "INPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_INT32 + dims: [ 1 ] + + + } +] +sequence_batching { + max_sequence_idle_microseconds: 5000000 + control_input [ + { + name: "START" + control [ + { + kind: CONTROL_SEQUENCE_START + int32_false_true: [ 0, 1 ] + } + ] + }, + { + name: "READY" + control [ + { + kind: CONTROL_SEQUENCE_READY + int32_false_true: [ 0, 1 ] + } + ] + } + ] +} diff --git a/qa/python_models/sequence_int32/model.py b/qa/python_models/sequence_int32/model.py new file mode 100644 index 0000000000..445cb5b13e --- /dev/null +++ b/qa/python_models/sequence_int32/model.py @@ -0,0 +1,92 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + self.accumulator = np.zeros(1) + self.max_batch_size = model_config["max_batch_size"] + + def execute(self, requests): + """ + This function is called on inference request. + It is derived from "create_tf_modelfile" in + common/gen_qa_sequence_models.py and maintains + a true accumulator when the max batch size is 0 + + """ + output_dtype = self.output_dtype + + responses = [] + for request in requests: + input_tensor = ( + pb_utils.get_input_tensor_by_name(request, "INPUT") + .as_numpy() + .astype(np.int32) + ) + start_tensor = ( + pb_utils.get_input_tensor_by_name(request, "START") + .as_numpy() + .astype(np.int32) + ) + ready_tensor = ( + pb_utils.get_input_tensor_by_name(request, "READY") + .as_numpy() + .astype(np.int32) + ) + + if self.max_batch_size == 0: + tmp = np.where( + np.equal(start_tensor, 1), + input_tensor, + np.add(self.accumulator, input_tensor), + ) + newacc = np.where(np.equal(ready_tensor, 1), tmp, self.accumulator) + self.accumulator = newacc + out_tensor = pb_utils.Tensor( + "OUTPUT", self.accumulator.astype(output_dtype) + ) + else: + tmp = np.where( + np.equal(ready_tensor, 1), + np.add(start_tensor, input_tensor), + np.zeros(np.shape(input_tensor), dtype=output_dtype), + ) + out_tensor = pb_utils.Tensor("OUTPUT", tmp.astype(output_dtype)) + + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/sequence_py/config.pbtxt b/qa/python_models/sequence_py/config.pbtxt new file mode 100644 index 0000000000..b58796058d --- /dev/null +++ b/qa/python_models/sequence_py/config.pbtxt @@ -0,0 +1,53 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +backend: "python" +max_batch_size: 4 + +input [ + { + name: "INPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_INT32 + dims: [ 1 ] + } +] + +sequence_batching { + oldest { + max_candidate_sequences: 4 + max_queue_delay_microseconds: 1000000 + preserve_ordering: False + } + max_sequence_idle_microseconds: 10000000 +} diff --git a/qa/python_models/sequence_py/model.py b/qa/python_models/sequence_py/model.py new file mode 100644 index 0000000000..b375af3e30 --- /dev/null +++ b/qa/python_models/sequence_py/model.py @@ -0,0 +1,93 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + self.sequences = {} + self.decoupled = self.model_config.get("model_transaction_policy", {}).get( + "decoupled" + ) + + def get_next_sequence_output_tensor(self, request): + sid = request.correlation_id() + flags = request.flags() + if flags == pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START: + if sid in self.sequences: + raise pb_utils.TritonModelException( + "Can't start a new sequence with existing ID" + ) + self.sequences[sid] = [1] + else: + if sid not in self.sequences: + raise pb_utils.TritonModelException( + "Need START flag for a sequence ID that doesn't already exist." 
+ ) + + last = self.sequences[sid][-1] + self.sequences[sid].append(last + 1) + + output = self.sequences[sid][-1] + output = np.array([output]) + out_tensor = pb_utils.Tensor("OUTPUT0", output.astype(np.int32)) + return out_tensor + + def execute(self, requests): + if self.decoupled: + return self.execute_decoupled(requests) + else: + return self.execute_non_decoupled(requests) + + def execute_non_decoupled(self, requests): + responses = [] + for request in requests: + output_tensor = self.get_next_sequence_output_tensor(request) + response = pb_utils.InferenceResponse([output_tensor]) + responses.append(response) + return responses + + def execute_decoupled(self, requests): + for request in requests: + sender = request.get_response_sender() + output_tensor = self.get_next_sequence_output_tensor(request) + + # Send 3 responses per request + for _ in range(3): + response = pb_utils.InferenceResponse([output_tensor]) + sender.send(response) + + sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + + return None + + def finalize(self): + print(f"Cleaning up. Final sequences stored: {self.sequences}") diff --git a/qa/python_models/simple_identity_fp32/config.pbtxt b/qa/python_models/simple_identity_fp32/config.pbtxt new file mode 100644 index 0000000000..cc5931ad63 --- /dev/null +++ b/qa/python_models/simple_identity_fp32/config.pbtxt @@ -0,0 +1,62 @@ +# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
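For context, the sequence_py model above keeps a per-sequence counter keyed by correlation ID and returns 1, 2, 3, ... for consecutive requests of a sequence. A minimal client sketch (hypothetical; it assumes a local server with the model loaded under its directory name "sequence_py" and the default, non-decoupled config shown above):

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

def infer(seq_id, start=False, end=False):
    # INPUT0's value is ignored by the model; only the sequence flags and
    # correlation ID drive the counter.
    inp = httpclient.InferInput("INPUT0", [1, 1], "INT32")
    inp.set_data_from_numpy(np.zeros((1, 1), dtype=np.int32))
    result = client.infer(
        "sequence_py",
        inputs=[inp],
        sequence_id=seq_id,
        sequence_start=start,
        sequence_end=end,
    )
    return result.as_numpy("OUTPUT0")

print(infer(42, start=True))  # counter value 1
print(infer(42))              # counter value 2
print(infer(42, end=True))    # counter value 3

In the decoupled variant exercised by the tests, each request instead yields three copies of the same counter value followed by the complete-final flag.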
+ +name: "simple_identity_fp32" +platform: "ensemble" +max_batch_size: 64 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +ensemble_scheduling { + step [ + { + model_name: "identity_fp32" + model_version: -1 + input_map { + key: "INPUT0" + value: "INPUT0" + } + output_map { + key: "OUTPUT0" + value: "OUTPUT0" + } + } + ] +} diff --git a/qa/python_models/string/config.pbtxt b/qa/python_models/string/config.pbtxt new file mode 100644 index 0000000000..279c6b5097 --- /dev/null +++ b/qa/python_models/string/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "string" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/string/model.py b/qa/python_models/string/model.py new file mode 100644 index 0000000000..5e419d965a --- /dev/null +++ b/qa/python_models/string/model.py @@ -0,0 +1,49 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model loops through different dtypes to make sure that + serialize_byte_tensor works correctly in the Python backend. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", in_0.as_numpy().astype(self._dtypes[self._index]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/python_models/string_fixed/config.pbtxt b/qa/python_models/string_fixed/config.pbtxt new file mode 100644 index 0000000000..c08783f5dc --- /dev/null +++ b/qa/python_models/string_fixed/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
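A quick note on the string model above: a TYPE_STRING tensor in the Python backend can be backed by either numpy dtype the model cycles through, which is exactly what the test is meant to exercise. A tiny sketch of the two representations:

import numpy as np

# Fixed-width byte strings vs. arbitrary Python objects; the string model above
# alternates between these dtypes across requests to check serialization.
as_bytes = np.array(["0123456789"], dtype=np.bytes_)
as_object = np.array(["0123456789"], dtype=np.object_)
print(as_bytes.dtype, as_object.dtype)  # |S10 object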
+ +name: "string_fixed" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ -1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/string_fixed/model.py b/qa/python_models/string_fixed/model.py new file mode 100644 index 0000000000..d6e23eccb8 --- /dev/null +++ b/qa/python_models/string_fixed/model.py @@ -0,0 +1,63 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """ + This model returns a constant string on every inference request. + """ + + def initialize(self, args): + self._index = 0 + self._dtypes = [np.bytes_, np.object_] + + def execute(self, requests): + # Create four different responses (empty string or fixed string) * (two + # datatypes) + responses = [] + for _ in requests: + if self._index == 0: + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", np.array(["123456"], dtype=self._dtypes[0]) + ) + elif self._index == 1: + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", np.array([], dtype=self._dtypes[1]) + ) + elif self._index == 2: + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", np.array(["123456"], dtype=self._dtypes[0]) + ) + elif self._index == 3: + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", np.array([], dtype=self._dtypes[1]) + ) + self._index += 1 + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/python_models/string_identity/config.pbtxt b/qa/python_models/string_identity/config.pbtxt new file mode 100644 index 0000000000..55ea21d3d4 --- /dev/null +++ b/qa/python_models/string_identity/config.pbtxt @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "string_identity" +backend: "python" +max_batch_size: 0 + +input [ + { + name: "INPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [ 1 ] + } +] + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/string_identity/model.py b/qa/python_models/string_identity/model.py new file mode 100644 index 0000000000..0288b129bc --- /dev/null +++ b/qa/python_models/string_identity/model.py @@ -0,0 +1,48 @@ +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import sys + +sys.path.append("../../") +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + """This model always returns the input that it has received.""" + + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + + def execute(self, requests): + """This function is called on inference request.""" + + responses = [] + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor_0 = pb_utils.Tensor("OUTPUT0", in_0.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor_0])) + return responses diff --git a/qa/python_models/sub_add/model.py b/qa/python_models/sub_add/model.py new file mode 100644 index 0000000000..8ac679c86f --- /dev/null +++ b/qa/python_models/sub_add/model.py @@ -0,0 +1,78 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import sys + +import numpy as np + +sys.path.append("../../") +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config["data_type"] + ) + + def execute(self, requests): + """This function is called on inference request.""" + + output0_dtype = self.output0_dtype + output1_dtype = self.output1_dtype + + responses = [] + for request in requests: + input_tensors = request.inputs() + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + ) + else: + out_0, out_1 = ( + in_0.as_numpy() - in_1.as_numpy(), + in_0.as_numpy() + in_1.as_numpy(), + ) + + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + return responses diff --git a/qa/python_models/torchvision/resnet50/config.pbtxt b/qa/python_models/torchvision/resnet50/config.pbtxt new file mode 100644 index 0000000000..fdbc7c7de9 --- /dev/null +++ b/qa/python_models/torchvision/resnet50/config.pbtxt @@ -0,0 +1,40 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
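The ResNet50 config that follows expects a 3x224x224 FP32 NCHW tensor. For illustration only, a client might prepare such an input roughly like this (plain ImageNet-style preprocessing; the exact resize and normalization used by the QA harness may differ):

import numpy as np
from PIL import Image

def preprocess(image_path):
    # Resize to 224x224, scale to [0, 1], normalize with ImageNet statistics,
    # then reorder HWC -> CHW and add a batch dimension.
    img = Image.open(image_path).convert("RGB").resize((224, 224))
    x = np.asarray(img, dtype=np.float32) / 255.0
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    x = (x - mean) / std
    return np.expand_dims(x.transpose(2, 0, 1), axis=0)  # shape (1, 3, 224, 224)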
+ +name: "resnet50_python" +backend: "python" +max_batch_size: 128 +input { + name: "INPUT0" + data_type: TYPE_FP32 + format: FORMAT_NCHW + dims: [ 3, 224, 224 ] + } +output { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1000 ] + } diff --git a/qa/python_models/torchvision/resnet50/model.py b/qa/python_models/torchvision/resnet50/model.py new file mode 100644 index 0000000000..6a31a77420 --- /dev/null +++ b/qa/python_models/torchvision/resnet50/model.py @@ -0,0 +1,64 @@ +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack + + +class TritonPythonModel: + def initialize(self, args): + """ + This function initializes pre-trained ResNet50 model. + """ + self.device = "cuda" if args["model_instance_kind"] == "GPU" else "cpu" + # Avoid the "HTTP Error 403: rate limit exceeded" error + torch.hub._validate_not_a_forked_repo = lambda a, b, c: True + # Our tests currently depend on torchvision=0.14, + # to make sure `torch.hub` loads Resnet50 implementation + # compatible with torchvision=0.14, we need to provide tag + self.model = ( + torch.hub.load( + "pytorch/vision:v0.14.1", "resnet50", weights="IMAGENET1K_V2" + ) + .to(self.device) + .eval() + ) + + def execute(self, requests): + """ + This function receives a list of requests (`pb_utils.InferenceRequest`), + performs inference on every request and appends it to responses. 
+ """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + result = self.model( + torch.as_tensor(input_tensor.as_numpy(), device=self.device) + ) + out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT0", to_dlpack(result)) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/variable_gpu_output/config.pbtxt b/qa/python_models/variable_gpu_output/config.pbtxt new file mode 100644 index 0000000000..8fe69444f7 --- /dev/null +++ b/qa/python_models/variable_gpu_output/config.pbtxt @@ -0,0 +1,55 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "variable_gpu_output" +backend: "python" +max_batch_size: 256 + +input [ + { + name: "INPUT" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +dynamic_batching { + max_queue_delay_microseconds: 1000000 +} + +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] diff --git a/qa/python_models/variable_gpu_output/model.py b/qa/python_models/variable_gpu_output/model.py new file mode 100644 index 0000000000..2da2a3cbd2 --- /dev/null +++ b/qa/python_models/variable_gpu_output/model.py @@ -0,0 +1,46 @@ +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack + + +class TritonPythonModel: + def execute(self, requests): + # The client will send 5 requests + assert len(requests) == 5 + responses = [] + for i, request in enumerate(requests): + # Create an (i+1)-element array with all the tensors equal to (i+1) + output = torch.ones(i + 1, dtype=torch.float32, device="cuda") + output = output * (i + 1) + output_pb_tensor = pb_utils.Tensor.from_dlpack("OUTPUT", to_dlpack(output)) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_pb_tensor] + ) + responses.append(inference_response) + return responses diff --git a/qa/python_models/wrong_model/config.pbtxt b/qa/python_models/wrong_model/config.pbtxt new file mode 100644 index 0000000000..a9d05275a8 --- /dev/null +++ b/qa/python_models/wrong_model/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "identity_fp32" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/wrong_model/model.py b/qa/python_models/wrong_model/model.py new file mode 100644 index 0000000000..2cac72324f --- /dev/null +++ b/qa/python_models/wrong_model/model.py @@ -0,0 +1,42 @@ +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + This model ensures that errors in the execute function are properly + handles. + """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + lorem_ipsum + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/qa/python_models/wrong_return_type/config.pbtxt b/qa/python_models/wrong_return_type/config.pbtxt new file mode 100644 index 0000000000..e34905e635 --- /dev/null +++ b/qa/python_models/wrong_return_type/config.pbtxt @@ -0,0 +1,49 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "wrong_return_type" +backend: "python" + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +sequence_batching { + iterative_sequence : true +} + +instance_group [{ kind: KIND_CPU }] diff --git a/qa/python_models/wrong_return_type/model.py b/qa/python_models/wrong_return_type/model.py new file mode 100644 index 0000000000..c5e6f660fc --- /dev/null +++ b/qa/python_models/wrong_return_type/model.py @@ -0,0 +1,67 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = model_config = json.loads(args["model_config"]) + + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + + self.output0_dtype = pb_utils.triton_string_to_numpy( + output0_config["data_type"] + ) + + def execute(self, requests): + output0_dtype = self.output0_dtype + + responses = [] + + for request in requests: + in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") + + out_0 = in_0.as_numpy() + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. 
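+            # Note: this QA model deliberately reschedules the request (see
+            # set_release_flags below) and still appends a full response
+            # instead of `None`, exercising the backend's handling of an
+            # incorrect return type for rescheduled requests.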
+ out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[out_tensor_0] + ) + + request.set_release_flags(pb_utils.TRITONSERVER_REQUEST_RELEASE_RESCHEDULE) + # Should append `None` for rescheduled requests. + responses.append(inference_response) + + return responses + + def finalize(self): + pass diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..9488fc6233 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,795 @@ +# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cmake_minimum_required (VERSION 3.18) + +project(tritonserverexe LANGUAGES C CXX) + +include(GNUInstallDirs) + +# +# Dependencies +# +# We must include the transitive closure of all repos so that we can +# override the tag. The backend repo is needed for the tests. 
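+# For example, the organization and tags can be overridden at configure
+# time (the values below are illustrative only, not defined in this file):
+#
+#   cmake -DTRITON_REPO_ORGANIZATION=https://github.com/triton-inference-server \
+#         -DTRITON_COMMON_REPO_TAG=main \
+#         -DTRITON_CORE_REPO_TAG=main \
+#         -DTRITON_BACKEND_REPO_TAG=main ..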
+# +include(FetchContent) + +FetchContent_Declare( + repo-common + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git + GIT_TAG ${TRITON_COMMON_REPO_TAG} +) +FetchContent_Declare( + repo-core + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git + GIT_TAG ${TRITON_CORE_REPO_TAG} +) +FetchContent_Declare( + repo-backend + GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git + GIT_TAG ${TRITON_BACKEND_REPO_TAG} +) + +if(TRITON_ENABLE_GRPC) + set(TRITON_COMMON_ENABLE_PROTOBUF ON) + set(TRITON_COMMON_ENABLE_GRPC ON) +endif() # TRITON_ENABLE_GRPC + +FetchContent_MakeAvailable(repo-common repo-core repo-backend) + +# CUDA +# +if(${TRITON_ENABLE_GPU}) + find_package(CUDAToolkit REQUIRED) + message(STATUS "Using CUDA ${CUDA_VERSION}") +endif() # TRITON_ENABLE_GPU + +# libevent +# +if(${TRITON_ENABLE_HTTP} OR ${TRITON_ENABLE_METRICS} OR + ${TRITON_ENABLE_SAGEMAKER} OR ${TRITON_ENABLE_VERTEX_AI}) + find_package(Libevent CONFIG REQUIRED) + message(STATUS "Using libevent ${Libevent_VERSION}") +endif() + +# OpenTelemetry +# +if (NOT WIN32 AND ${TRITON_ENABLE_TRACING}) + find_package(absl CONFIG REQUIRED) + find_package(CURL CONFIG REQUIRED) + find_package(nlohmann_json CONFIG REQUIRED) + find_package(opentelemetry-cpp CONFIG REQUIRED) + message(STATUS "Using opentelemetry-cpp ${opentelemetry-cpp_VERSION}") +endif() + +# re2 +# +find_package(re2 REQUIRED) + +# +# tritonserver executable +# +add_executable( + main + classification.cc + command_line_parser.cc + common.cc + main.cc + shared_memory_manager.cc + triton_signal.cc + classification.h + common.h + shared_memory_manager.h + triton_signal.h +) + +# On windows a *.lib file can be generated for a exe. When creating +# tritonserver.exe if we try to create tritonserver.lib it will fail +# because there is already a trtionserver.lib for tritonserver.dll, +# this causes the build to fail. To avoid we keep the build name as +# main.exe and then for windows after installing we rename it to +# tritonserver.exe (below in the install steps). +if (NOT WIN32) + set_property(TARGET main PROPERTY OUTPUT_NAME tritonserver) +endif() + +target_compile_features(main PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + message("Using MSVC as compiler, default target on Windows 10. 
" + "If the target system is not Windows 10, please update _WIN32_WINNT " + "to corresponding value.") + target_compile_options( + main + PRIVATE + /W1 /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor + ) + target_compile_definitions(main + PRIVATE + NOMINMAX) +else() + target_compile_options( + main + PRIVATE + -Wall -Wextra -Wno-unused-parameter -Wno-deprecated-declarations -Werror + ) +endif() + +set(LIB_DIR "lib") +if(LINUX) + file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE") + if(${DISTRO_ID_LIKE} MATCHES "rhel|centos") + set (LIB_DIR "lib64") + endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos") +endif(LINUX) +set(TRITON_CORE_HEADERS_ONLY OFF) + +set_target_properties( + main + PROPERTIES + POSITION_INDEPENDENT_CODE ON + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "$\{ORIGIN\}/../${LIB_DIR}" +) + +target_link_libraries( + main + PRIVATE + triton-common-async-work-queue # from repo-common + triton-common-error # from repo-common + triton-common-logging # from repo-common + triton-core-serverapi # from repo-core + triton-core-serverstub # from repo-core +) + +if(${TRITON_ENABLE_ASAN}) + set(CMAKE_BUILD_TYPE Debug) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_ASAN=1 + ) + set(_ASAN_FLAGS "-static-libstdc++ -static-libasan -fno-omit-frame-pointer -fsanitize=address") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${_ASAN_FLAGS}") + set(CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} ${_ASAN_FLAGS}") +endif() # TRITON_ENABLE_ASAN + +if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) + + target_link_libraries( + main + PRIVATE + CUDA::cudart + ) +endif() # TRITON_ENABLE_GPU + +if(${TRITON_ENABLE_HTTP} OR ${TRITON_ENABLE_METRICS} OR + ${TRITON_ENABLE_SAGEMAKER} OR ${TRITON_ENABLE_VERTEX_AI}) + target_include_directories( + main + PRIVATE + ${LIBEVENT_INCLUDE_DIRS} + ) +endif() + + +if(${TRITON_ENABLE_HTTP}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_HTTP=1 + ) +endif() # TRITON_ENABLE_HTTP + +if(${TRITON_ENABLE_SAGEMAKER}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_SAGEMAKER=1 + ) +endif() # TRITON_ENABLE_SAGEMAKER + +if(${TRITON_ENABLE_VERTEX_AI}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_VERTEX_AI=1 + ) +endif() # TRITON_ENABLE_VERTEX_AI + +if(${TRITON_ENABLE_LOGGING}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_LOGGING=1 + ) +endif() # TRITON_ENABLE_LOGGING + +if(${TRITON_ENABLE_METRICS}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_METRICS=1 + ) +endif() # TRITON_ENABLE_METRICS + +if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_STATS=1 + ) +endif() # TRITON_ENABLE_STATS + +if(${TRITON_ENABLE_TRACING}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_TRACING=1 + ) +# FIXME: remove, when Windows support is added for Opentelemetry + if (NOT WIN32) + target_include_directories( + main + PRIVATE + ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + endif() +endif() # TRITON_ENABLE_TRACING + +if(${TRITON_ENABLE_NVTX}) + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_NVTX=1 + ) +endif() # TRITON_ENABLE_NVTX + +if (NOT WIN32) + target_link_libraries( + main + PRIVATE + rt + dl + ) +endif() # NOT WIN32 + +if (NOT WIN32) + install( + TARGETS main + RUNTIME DESTINATION bin + ) +else() + # See explanation above as to why we need to rename main.exe to + 
# tritonserver.exe as part of the install process on windows. + install( + PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/main.exe + DESTINATION bin + RENAME tritonserver.exe + ) +endif() + +if(${TRITON_ENABLE_GRPC}) + # + # GRPC + # + find_package(gRPC CONFIG REQUIRED) + message(STATUS "Using gRPC ${gRPC_VERSION}") + + add_subdirectory(grpc) + target_link_libraries( + main + PRIVATE + grpc-endpoint-library + ) + + target_include_directories( + main + PRIVATE + $ + ) + + target_compile_definitions( + main + PRIVATE TRITON_ENABLE_GRPC=1 + ) +endif() + +# http endpoint +# +if(${TRITON_ENABLE_HTTP} + OR ${TRITON_ENABLE_METRICS} + OR ${TRITON_ENABLE_SAGEMAKER} + OR ${TRITON_ENABLE_VERTEX_AI}) + find_package(libevhtp CONFIG REQUIRED) + message(STATUS "Using libevhtp ${libevhtp_VERSION}") + + list(APPEND + HTTP_ENDPOINT_SRCS + http_server.cc + ) + list(APPEND + HTTP_ENDPOINT_HDRS + http_server.h + ) + + # Add header / src files based on HTTP related endpoint requested + if(${TRITON_ENABLE_SAGEMAKER}) + list(APPEND + HTTP_ENDPOINT_SRCS + sagemaker_server.cc + ) + list(APPEND + HTTP_ENDPOINT_HDRS + sagemaker_server.h + ) + endif() # TRITON_ENABLE_SAGEMAKER + + if(${TRITON_ENABLE_VERTEX_AI}) + list(APPEND + HTTP_ENDPOINT_SRCS + vertex_ai_server.cc + ) + list(APPEND + HTTP_ENDPOINT_HDRS + vertex_ai_server.h + ) + endif() # TRITON_ENABLE_VERTEX_AI + + add_library( + http-endpoint-library EXCLUDE_FROM_ALL + ${HTTP_ENDPOINT_SRCS} ${HTTP_ENDPOINT_HDRS} + ) + + target_compile_features(http-endpoint-library PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + target_compile_options( + http-endpoint-library + PRIVATE + /W1 /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor + ) + else() + target_compile_options( + http-endpoint-library + PRIVATE + -Wall -Wextra -Wno-unused-parameter -Wno-deprecated-declarations -Werror + ) + endif() + + set_target_properties( + http-endpoint-library + PROPERTIES + POSITION_INDEPENDENT_CODE ON + ) + + target_link_libraries( + http-endpoint-library + PUBLIC + triton-common-json # from repo-common + triton-common-logging # from repo-common + triton-core-serverapi # from repo-core + triton-core-serverstub # from repo-core + ${LIBEVENT_LIBRARIES} + libevhtp::evhtp + re2::re2 + ) + + target_include_directories( + http-endpoint-library + PRIVATE $ + ) + + # FIXME when Triton support of Opentelemetry is available on Windows + # add ${OPENTELEMETRY_CPP_INCLUDE_DIRS} to above target_include_directories + # JIRA DLIS-4786 + if (NOT WIN32 AND ${TRITON_ENABLE_TRACING}) + target_include_directories( + http-endpoint-library + PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + endif() + + if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) + + target_link_libraries( + http-endpoint-library + PUBLIC + CUDA::cudart + ) + endif() # TRITON_ENABLE_GPU + + if(${TRITON_ENABLE_HTTP}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_HTTP=1 + ) + endif() # TRITON_ENABLE_HTTP + + if(${TRITON_ENABLE_SAGEMAKER}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_SAGEMAKER=1 + ) + endif() # TRITON_ENABLE_SAGEMAKER + + if(${TRITON_ENABLE_VERTEX_AI}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_VERTEX_AI=1 + ) + endif() # TRITON_ENABLE_VERTEX_AI + + if(${TRITON_ENABLE_METRICS}) + target_compile_definitions( + http-endpoint-library + PRIVATE 
TRITON_ENABLE_METRICS=1 + ) + endif() # TRITON_ENABLE_METRICS + + if(${TRITON_ENABLE_LOGGING}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_LOGGING=1 + ) + endif() # TRITON_ENABLE_LOGGING + + if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_STATS=1 + ) + endif() # TRITON_ENABLE_STATS + + if(${TRITON_ENABLE_TRACING}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_TRACING=1 + ) + endif() # TRITON_ENABLE_TRACING + + if(${TRITON_ENABLE_NVTX}) + target_compile_definitions( + http-endpoint-library + PRIVATE TRITON_ENABLE_NVTX=1 + ) + endif() # TRITON_ENABLE_NVTX + + if (WIN32) + find_library(B64_LIBRARY NAMES b64) + find_library(ZLIB_LIBRARY NAMES zlib) + target_link_libraries( + http-endpoint-library + PUBLIC + ${B64_LIBRARY} + ${ZLIB_LIBRARY} + ) + else() + target_link_libraries( + http-endpoint-library + PUBLIC + b64 + z + ) + endif() + + target_link_libraries( + main + PRIVATE + http-endpoint-library + ) +endif() # TRITON_ENABLE_HTTP || TRITON_ENABLE_METRICS || + # TRITON_ENABLE_SAGEMAKER || TRITON_ENABLE_VERTEX_AI + +# tracing +# +if(${TRITON_ENABLE_TRACING}) + message(STATUS "Using tracing ${TRITON_TRACE_INSTALL_PATH}") + + add_library( + tracing-library EXCLUDE_FROM_ALL + tracer.cc tracer.h + ) + + target_compile_features(tracing-library PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) + + if (NOT WIN32) + target_include_directories( + tracing-library + PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS} + ) + + target_link_libraries( + tracing-library + PRIVATE + ${OPENTELEMETRY_CPP_LIBRARIES}) + endif() + + target_link_libraries( + tracing-library + PUBLIC + triton-common-logging # from repo-common + triton-common-json # from repo-common + triton-core-serverapi # from repo-core + triton-core-serverstub # from repo-core + ) + + target_compile_definitions( + tracing-library + PRIVATE TRITON_ENABLE_TRACING=1 + ) + + if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + tracing-library + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) + + target_link_libraries( + tracing-library + PUBLIC + CUDA::cudart + ) + endif() # TRITON_ENABLE_GPU + + if(${TRITON_ENABLE_METRICS}) + target_compile_definitions( + tracing-library + PRIVATE TRITON_ENABLE_METRICS=1 + ) + endif() # TRITON_ENABLE_METRICS + + if(${TRITON_ENABLE_LOGGING}) + target_compile_definitions( + tracing-library + PRIVATE TRITON_ENABLE_LOGGING=1 + ) + endif() # TRITON_ENABLE_LOGGING + + if(${TRITON_ENABLE_STATS}) + target_compile_definitions( + tracing-library + PRIVATE TRITON_ENABLE_STATS=1 + ) + endif() # TRITON_ENABLE_STATS + + if(${TRITON_ENABLE_NVTX}) + target_compile_definitions( + tracing-library + PRIVATE TRITON_ENABLE_NVTX=1 + ) + endif() # TRITON_ENABLE_NVTX + + target_link_libraries( + main + PRIVATE + tracing-library + ) +endif() # TRITON_ENABLE_TRACING + +if (NOT WIN32) + # + # simple + # + add_executable( + simple + simple.cc + ) + + target_compile_features(simple PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + message("Using MSVC as compiler, default target on Windows 10. 
" + "If the target system is not Windows 10, please update _WIN32_WINNT " + "to corresponding value.") + target_compile_options( + simple + PRIVATE + /W1 /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor + ) + else() + target_compile_options( + simple + PRIVATE + -Wall -Wextra -Wno-type-limits -Wno-unused-parameter -Wno-deprecated-declarations -Werror + ) + endif() + + set_target_properties( + simple + PROPERTIES + POSITION_INDEPENDENT_CODE ON + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "" + ) + + target_link_libraries( + simple + PRIVATE + triton-common-async-work-queue # from repo-common + triton-common-error # from repo-common + triton-core-serverapi # from repo-core + triton-core-serverstub # from repo-core + ) + + if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + simple + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) + + target_link_libraries( + simple + PRIVATE + CUDA::cudart + ) + endif() # TRITON_ENABLE_GPU + + install( + TARGETS simple + RUNTIME DESTINATION bin + ) + + # + # multi_server example + # + add_executable( + multi_server + multi_server.cc + ) + + target_compile_features(multi_server PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + message("Using MSVC as compiler, default target on Windows 10. " + "If the target system is not Windows 10, please update _WIN32_WINNT " + "to corresponding value.") + target_compile_options( + multi_server + PRIVATE + /W1 /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor + ) + else() + target_compile_options( + multi_server + PRIVATE + -Wall -Wextra -Wno-type-limits -Wno-unused-parameter -Wno-deprecated-declarations -Werror + ) + endif() + + set_target_properties( + multi_server + PROPERTIES + POSITION_INDEPENDENT_CODE ON + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "" + ) + + target_link_libraries( + multi_server + PRIVATE + triton-common-async-work-queue # from repo-common + triton-common-error # from repo-common + triton-core-serverapi # from repo-core + triton-core-serverstub # from repo-core + ) + + if(${TRITON_ENABLE_GPU}) + target_compile_definitions( + multi_server + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) + + target_link_libraries( + multi_server + PRIVATE + CUDA::cudart + ) + endif() # TRITON_ENABLE_GPU + + install( + TARGETS multi_server + RUNTIME DESTINATION bin + ) + + if(${TRITON_ENABLE_GPU}) + # + # memory_alloc example + # + add_executable( + memory_alloc + memory_alloc.cc + ) + + target_compile_features(memory_alloc PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD}) + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + message("Using MSVC as compiler, default target on Windows 10. 
" + "If the target system is not Windows 10, please update _WIN32_WINNT " + "to corresponding value.") + target_compile_options( + memory_alloc + PRIVATE + /W1 /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor + ) + else() + target_compile_options( + memory_alloc + PRIVATE + -Wall -Wextra -Wno-type-limits -Wno-unused-parameter -Wno-deprecated-declarations -Werror + ) + endif() + + set_target_properties( + memory_alloc + PROPERTIES + POSITION_INDEPENDENT_CODE ON + SKIP_BUILD_RPATH TRUE + BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH FALSE + INSTALL_RPATH "" + ) + + target_compile_definitions( + memory_alloc + PRIVATE TRITON_ENABLE_GPU=1 + PRIVATE TRITON_MIN_COMPUTE_CAPABILITY=${TRITON_MIN_COMPUTE_CAPABILITY} + ) + + target_link_libraries( + memory_alloc + PRIVATE + triton-common-async-work-queue # from repo-common + triton-common-error # from repo-common + triton-core-serverapi # from repo-core + triton-core-serverstub # from repo-core + CUDA::cudart + ) + + install( + TARGETS memory_alloc + RUNTIME DESTINATION bin + ) + endif() # TRITON_ENABLE_GPU +endif() # NOT WIN32 + +# DLIS-7292: Extend tritonfrontend to build for Windows +if (NOT WIN32) + # tritonfrontend python package + add_subdirectory(python) +endif (NOT WIN32) + +# Currently unit tests do not build for windows... +if ( NOT WIN32) + add_subdirectory(test test) +endif() # NOT WIN32 + diff --git a/src/classification.cc b/src/classification.cc new file mode 100644 index 0000000000..2d8cd26b9e --- /dev/null +++ b/src/classification.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "classification.h" + +#include +#include + +#include "common.h" + +namespace triton { namespace server { + +namespace { + +template +TRITONSERVER_Error* +AddClassResults( + TRITONSERVER_InferenceResponse* response, const uint32_t output_idx, + const char* base, const size_t element_cnt, const uint32_t req_class_cnt, + std::vector* class_strs) +{ + const T* probs = reinterpret_cast(base); + + std::vector idx(element_cnt); + iota(idx.begin(), idx.end(), 0); + sort(idx.begin(), idx.end(), [&probs](size_t i1, size_t i2) { + return probs[i1] > probs[i2]; + }); + + const size_t class_cnt = std::min(element_cnt, (size_t)req_class_cnt); + for (size_t k = 0; k < class_cnt; ++k) { + class_strs->push_back( + std::to_string(probs[idx[k]]) + ":" + std::to_string(idx[k])); + + const char* label; + RETURN_IF_ERR(TRITONSERVER_InferenceResponseOutputClassificationLabel( + response, output_idx, idx[k], &label)); + if (label != nullptr) { + class_strs->back() += ":"; + class_strs->back().append(label); + } + } + + return nullptr; // success +} + +} // namespace + + +TRITONSERVER_Error* +TopkClassifications( + TRITONSERVER_InferenceResponse* response, const uint32_t output_idx, + const char* base, const size_t byte_size, + const TRITONSERVER_DataType datatype, const uint32_t req_class_count, + std::vector* class_strs) +{ + const size_t element_cnt = + byte_size / TRITONSERVER_DataTypeByteSize(datatype); + + switch (datatype) { + case TRITONSERVER_TYPE_UINT8: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_UINT16: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_UINT32: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_UINT64: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + + case TRITONSERVER_TYPE_INT8: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_INT16: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_INT32: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_INT64: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + + case TRITONSERVER_TYPE_FP32: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + case TRITONSERVER_TYPE_FP64: + return AddClassResults( + response, output_idx, base, element_cnt, req_class_count, class_strs); + + default: + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INVALID_ARG, + std::string( + std::string("class result not available for output due to " + "unsupported type '") + + std::string(TRITONSERVER_DataTypeString(datatype)) + "'") + .c_str()); + } + + return nullptr; // success +} + +}} // namespace triton::server diff --git a/src/classification.h b/src/classification.h new file mode 100644 index 0000000000..9264baa2b0 --- /dev/null +++ b/src/classification.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#pragma once + +#include +#include + +#include "triton/core/tritonserver.h" + +namespace triton { namespace server { + +TRITONSERVER_Error* TopkClassifications( + TRITONSERVER_InferenceResponse* response, const uint32_t output_idx, + const char* base, const size_t byte_size, + const TRITONSERVER_DataType datatype, const uint32_t req_class_count, + std::vector* class_strs); + +}} // namespace triton::server diff --git a/src/clients/c++/BUILD b/src/clients/c++/BUILD deleted file mode 100644 index 2343d6ec6d..0000000000 --- a/src/clients/c++/BUILD +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -package( - default_visibility = ["//visibility:public"], -) - -cc_library( - name = "image_client_main", - srcs = ["image_client.cc"], - deps = [ - ":request", - "//src/core:model_config_proto", - ], -) - -cc_library( - name = "perf_client_main", - srcs = ["perf_client.cc"], - deps = [ - ":request", - "//src/core:constants", - "//src/core:model_config_proto", - ], -) - -cc_library( - name = "simple_client_main", - srcs = ["simple_client.cc"], - deps = [ - ":request", - "//src/core:model_config_proto", - ], -) - -cc_library( - name = "request", - srcs = ["request.cc"], - hdrs = ["request.h"], - deps = [ - "//src/core:api_proto", - "//src/core:constants", - "//src/core:grpc_service_proto", - "//src/core:model_config_proto", - "//src/core:model_config", - "//src/core:request_status_proto", - "//src/core:server_status_proto", - ], -) - -cc_binary( - name = "image_client", - deps = [ - ":image_client_main", - ":request", - ], - linkopts = [ - "-pthread", - "-lcurl", "-lz", - "-lopencv_core", "-lopencv_imgproc", "-lopencv_highgui" - ], -) - -cc_binary( - name = "perf_client", - deps = [ - ":perf_client_main", - ":request", - ], - linkopts = [ - "-pthread", "-lcurl" - ], -) - -cc_binary( - name = "simple_client", - deps = [ - ":simple_client_main", - ":request", - ], - linkopts = [ - "-pthread", "-lcurl" - ], -) diff --git a/src/clients/c++/image_client.cc b/src/clients/c++/image_client.cc deleted file mode 100644 index ca67063cf7..0000000000 --- a/src/clients/c++/image_client.cc +++ /dev/null @@ -1,688 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "src/clients/c++/request.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "src/core/model_config.pb.h" - -namespace ni = nvidia::inferenceserver; -namespace nic = nvidia::inferenceserver::client; - -namespace { - -enum ScaleType { NONE = 0, VGG = 1, INCEPTION = 2 }; - -enum ProtocolType { HTTP = 0, GRPC = 1 }; - -void -Preprocess( - const cv::Mat& img, ni::ModelInput::Format format, int img_type1, - int img_type3, size_t img_channels, const cv::Size& img_size, - const ScaleType scale, std::vector* input_data) -{ - // Image channels are in BGR order. Currently model configuration - // data doesn't provide any information as to the expected channel - // orderings (like RGB, BGR). We are going to assume that RGB is the - // most likely ordering and so change the channels to that ordering. - - cv::Mat sample; - if ((img.channels() == 3) && (img_channels == 1)) { - cv::cvtColor(img, sample, CV_BGR2GRAY); - } else if ((img.channels() == 4) && (img_channels == 1)) { - cv::cvtColor(img, sample, CV_BGRA2GRAY); - } else if ((img.channels() == 3) && (img_channels == 3)) { - cv::cvtColor(img, sample, CV_BGR2RGB); - } else if ((img.channels() == 4) && (img_channels == 3)) { - cv::cvtColor(img, sample, CV_BGRA2RGB); - } else if ((img.channels() == 1) && (img_channels == 3)) { - cv::cvtColor(img, sample, CV_GRAY2RGB); - } else { - std::cerr << "unexpected number of channels in input image or model" - << std::endl; - exit(1); - } - - cv::Mat sample_resized; - if (sample.size() != img_size) { - cv::resize(sample, sample_resized, img_size); - } else { - sample_resized = sample; - } - - cv::Mat sample_type; - sample_resized.convertTo( - sample_type, (img_channels == 3) ? img_type3 : img_type1); - - cv::Mat sample_final; - if (scale == ScaleType::INCEPTION) { - if (img_channels == 1) { - sample_final = sample_type.mul(cv::Scalar(1 / 128.0)); - sample_final = sample_final - cv::Scalar(1.0); - } else { - sample_final = - sample_type.mul(cv::Scalar(1 / 128.0, 1 / 128.0, 1 / 128.0)); - sample_final = sample_final - cv::Scalar(1.0, 1.0, 1.0); - } - } else if (scale == ScaleType::VGG) { - if (img_channels == 1) { - sample_final = sample_type - cv::Scalar(128); - } else { - sample_final = sample_type - cv::Scalar(104, 117, 123); - } - } else { - sample_final = sample_type; - } - - // Allocate a buffer to hold all image elements. - size_t img_byte_size = sample_final.total() * sample_final.elemSize(); - size_t pos = 0; - input_data->resize(img_byte_size); - - // For NHWC format Mat is already in the correct order but need to - // handle both cases of data being contigious or not. 
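  // Put differently, the element for channel c of the pixel at row h,
  // column w sits at h*(W*C) + w*C + c in NHWC (channels interleaved per
  // pixel) but at c*(H*W) + h*W + w in NCHW (one contiguous plane per
  // channel), which is why NHWC data can be copied directly while NCHW
  // needs the per-channel split handled below.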
- if (format == ni::ModelInput::FORMAT_NHWC) { - if (sample_final.isContinuous()) { - memcpy(&((*input_data)[0]), sample_final.datastart, img_byte_size); - pos = img_byte_size; - } else { - size_t row_byte_size = sample_final.cols * sample_final.elemSize(); - for (int r = 0; r < sample_final.rows; ++r) { - memcpy( - &((*input_data)[pos]), sample_final.ptr(r), row_byte_size); - pos += row_byte_size; - } - } - } else { - // (format == ni::ModelInput::FORMAT_NCHW) - // - // For CHW formats must split out each channel from the matrix and - // order them as BBBB...GGGG...RRRR. To do this split the channels - // of the image directly into 'input_data'. The BGR channels are - // backed by the 'input_data' vector so that ends up with CHW - // order of the data. - std::vector input_bgr_channels; - for (size_t i = 0; i < img_channels; ++i) { - input_bgr_channels.emplace_back( - img_size.height, img_size.width, img_type1, &((*input_data)[pos])); - pos += input_bgr_channels.back().total() * - input_bgr_channels.back().elemSize(); - } - - cv::split(sample_final, input_bgr_channels); - } - - if (pos != img_byte_size) { - std::cerr << "unexpected total size of channels " << pos << ", expecting " - << img_byte_size << std::endl; - exit(1); - } -} - -void -Postprocess( - const std::vector>& results, - const std::vector& filenames, const size_t batch_size) -{ - if (results.size() != 1) { - std::cerr << "expected 1 result, got " << results.size() << std::endl; - exit(1); - } - - const std::unique_ptr& result = results[0]; - - if (filenames.size() != batch_size) { - std::cerr << "expected " << batch_size << " filenames, got " - << filenames.size() << std::endl; - exit(1); - } - - for (size_t b = 0; b < batch_size; ++b) { - size_t cnt = 0; - nic::Error err = result->GetClassCount(b, &cnt); - if (!err.IsOk()) { - std::cerr << "failed reading class count for batch " << b << ": " << err - << std::endl; - exit(1); - } - - std::cout << "Image '" << filenames[b] << "':" << std::endl; - - for (size_t c = 0; c < cnt; ++c) { - nic::InferContext::Result::ClassResult cls; - nic::Error err = result->GetClassAtCursor(b, &cls); - if (!err.IsOk()) { - std::cerr << "failed reading class for batch " << b << ": " << err - << std::endl; - exit(1); - } - - std::cout << " " << cls.idx << " (" << cls.label << ") = " << cls.value - << std::endl; - } - } -} - -void -Usage(char** argv, const std::string& msg = std::string()) -{ - if (!msg.empty()) { - std::cerr << "error: " << msg << std::endl; - } - - std::cerr << "Usage: " << argv[0] - << " [options] " << std::endl; - std::cerr << " Note that image folder should only contain image files." - << std::endl; - std::cerr << "\t-v" << std::endl; - std::cerr << "\t-a" << std::endl; - std::cerr << "\t-b " << std::endl; - std::cerr << "\t-c " << std::endl; - std::cerr << "\t-s " << std::endl; - std::cerr << "\t-p " << std::endl; - std::cerr << "\t-m " << std::endl; - std::cerr << "\t-x " << std::endl; - std::cerr << "\t-u " << std::endl; - std::cerr << "\t-i " - << std::endl; - std::cerr << std::endl; - std::cerr << "If -a is specified then asynchronous client API will be used. " - << "Default is to use the synchronous API." << std::endl; - std::cerr - << "For -b, a single image will be replicated and sent in a batch" - << std::endl - << " of the specified size. A directory of images will be grouped" - << std::endl - << " into batches. Default is 1." << std::endl; - std::cerr << "For -c, the classes will be returned, default is 1." 
- << std::endl; - std::cerr << "For -s, specify the type of pre-processing scaling that" - << std::endl - << " should be performed on the image, default is NONE." - << std::endl - << " INCEPTION: scale each pixel RGB value to [-1.0, 1.0)." - << std::endl - << " VGG: subtract mean BGR value (104, 117, 123) from" - << std::endl - << " each pixel." << std::endl; - std::cerr - << "If -x is not specified the most recent version (that is, the highest " - << "numbered version) of the model will be used." << std::endl; - std::cerr << "For -p, it generates file only if image file is specified." - << std::endl; - std::cerr << "For -u, the default server URL is localhost:8000." << std::endl; - std::cerr << "For -i, available protocols are gRPC and HTTP. Default is HTTP." - << std::endl; - std::cerr << std::endl; - - exit(1); -} - -ScaleType -ParseScale(const std::string& str) -{ - if (str == "NONE") { - return ScaleType::NONE; - } else if (str == "INCEPTION") { - return ScaleType::INCEPTION; - } else if (str == "VGG") { - return ScaleType::VGG; - } - - std::cerr << "unexpected scale type \"" << str - << "\", expecting NONE, INCEPTION or VGG" << std::endl; - exit(1); - - return ScaleType::NONE; -} - -ProtocolType -ParseProtocol(const std::string& str) -{ - std::string protocol(str); - std::transform(protocol.begin(), protocol.end(), protocol.begin(), ::tolower); - if (protocol == "http") { - return ProtocolType::HTTP; - } else if (protocol == "grpc") { - return ProtocolType::GRPC; - } - - std::cerr << "unexpected protocol type \"" << str - << "\", expecting HTTP or gRPC" << std::endl; - exit(1); - - return ProtocolType::HTTP; -} - -bool -ParseType(const ni::DataType& dtype, int* type1, int* type3) -{ - if (dtype == ni::DataType::TYPE_UINT8) { - *type1 = CV_8UC1; - *type3 = CV_8UC3; - } else if (dtype == ni::DataType::TYPE_INT8) { - *type1 = CV_8SC1; - *type3 = CV_8SC3; - } else if (dtype == ni::DataType::TYPE_UINT16) { - *type1 = CV_16UC1; - *type3 = CV_16UC3; - } else if (dtype == ni::DataType::TYPE_INT16) { - *type1 = CV_16SC1; - *type3 = CV_16SC3; - } else if (dtype == ni::DataType::TYPE_INT32) { - *type1 = CV_32SC1; - *type3 = CV_32SC3; - } else if (dtype == ni::DataType::TYPE_FP32) { - *type1 = CV_32FC1; - *type3 = CV_32FC3; - } else if (dtype == ni::DataType::TYPE_FP64) { - *type1 = CV_64FC1; - *type3 = CV_64FC3; - } else { - return false; - } - - return true; -} - -void -ParseModel( - const std::unique_ptr& ctx, const size_t batch_size, - size_t* c, size_t* h, size_t* w, ni::ModelInput::Format* format, int* type1, - int* type3, bool verbose = false) -{ - if (ctx->Inputs().size() != 1) { - std::cerr << "expecting 1 input, model \"" << ctx->ModelName() << "\" has " - << ctx->Inputs().size() << std::endl; - exit(1); - } - - if (ctx->Outputs().size() != 1) { - std::cerr << "expecting 1 output, model \"" << ctx->ModelName() << "\" has " - << ctx->Outputs().size() << std::endl; - exit(1); - } - - const auto& input = ctx->Inputs()[0]; - const auto& output = ctx->Outputs()[0]; - - if (output->DType() != ni::DataType::TYPE_FP32) { - std::cerr << "expecting model output datatype to be TYPE_FP32, model \"" - << ctx->ModelName() << "\" output type is " - << ni::DataType_Name(output->DType()) << std::endl; - exit(1); - } - - // Output is expected to be a vector. But allow any number of - // dimensions as long as all but 1 is size 1 (e.g. { 10 }, { 1, 10 - // }, { 10, 1, 1 } are all ok). 
- size_t non_one_cnt = 0; - for (const auto dim : output->Dims()) { - if (dim > 1) { - non_one_cnt++; - if (non_one_cnt > 1) { - std::cerr << "expecting model output to be a vector" << std::endl; - exit(1); - } - } - } - - *format = input->Format(); - - int max_batch_size = ctx->MaxBatchSize(); - - // Model specifying maximum batch size of 0 indicates that batching - // is not supported and so the input tensors do not expect a "N" - // dimension (and 'batch_size' should be 1 so that only a single - // image instance is inferred at a time). - if (max_batch_size == 0) { - if (batch_size != 1) { - std::cerr << "batching not supported for model \"" << ctx->ModelName() - << "\"" << std::endl; - exit(1); - } - } else { - // max_batch_size > 0 - if (batch_size > (size_t)max_batch_size) { - std::cerr << "expecting batch size <= " << max_batch_size - << " for model \"" << ctx->ModelName() << "\"" << std::endl; - exit(1); - } - } - - if (input->Dims().size() != 3) { - std::cerr << "expecting model input to have 3 dimensions, model \"" - << ctx->ModelName() << "\" input has " << input->Dims().size() - << std::endl; - exit(1); - } - - // Input must be NHWC or NCHW... - if ( - (*format != ni::ModelInput::FORMAT_NCHW) && - (*format != ni::ModelInput::FORMAT_NHWC)) { - std::cerr << "unexpected input format " - << ni::ModelInput_Format_Name(*format) << ", expecting " - << ni::ModelInput_Format_Name(ni::ModelInput::FORMAT_NHWC) - << " or " - << ni::ModelInput_Format_Name(ni::ModelInput::FORMAT_NCHW) - << std::endl; - exit(1); - } - - if (*format == ni::ModelInput::FORMAT_NHWC) { - *h = input->Dims()[0]; - *w = input->Dims()[1]; - *c = input->Dims()[2]; - } else if (*format == ni::ModelInput::FORMAT_NCHW) { - *c = input->Dims()[0]; - *h = input->Dims()[1]; - *w = input->Dims()[2]; - } - - if (!ParseType(input->DType(), type1, type3)) { - std::cerr << "unexpected input datatype \"" - << ni::DataType_Name(input->DType()) << "\" for model \"" - << ctx->ModelName() << std::endl; - exit(1); - } -} - -void -FileToInputData( - const std::string& filename, size_t c, size_t h, size_t w, - ni::ModelInput::Format format, int type1, int type3, ScaleType scale, - std::vector* input_data) -{ - // Load the specified image. - std::ifstream file(filename); - std::vector data; - file >> std::noskipws; - std::copy( - std::istream_iterator(file), std::istream_iterator(), - std::back_inserter(data)); - if (data.empty()) { - std::cerr << "error: unable to read image file " << filename << std::endl; - exit(1); - } - - cv::Mat img = imdecode(cv::Mat(data), 1); - if (img.empty()) { - std::cerr << "error: unable to decode image " << filename << std::endl; - exit(1); - } - - // Pre-process the image to match input size expected by the model. - Preprocess(img, format, type1, type3, c, cv::Size(w, h), scale, input_data); -} - -} // namespace - -int -main(int argc, char** argv) -{ - bool verbose = false; - bool async = false; - size_t batch_size = 1; - size_t topk = 1; - ScaleType scale = ScaleType::NONE; - std::string preprocess_output_filename; - std::string model_name; - int model_version = -1; - std::string url("localhost:8000"); - ProtocolType protocol = ProtocolType::HTTP; - - // Parse commandline... 
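  // Example invocation (the model name and image path are placeholders,
  // not values taken from this file):
  //   image_client -m <model_name> -c 3 -s INCEPTION -u localhost:8000 <image file or folder>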
- int opt; - while ((opt = getopt(argc, argv, "vau:m:x:b:c:s:p:i:")) != -1) { - switch (opt) { - case 'v': - verbose = true; - break; - case 'a': - async = true; - break; - case 'u': - url = optarg; - break; - case 'm': - model_name = optarg; - break; - case 'x': - model_version = atoi(optarg); - break; - case 'b': - batch_size = atoi(optarg); - break; - case 'c': - topk = atoi(optarg); - break; - case 's': - scale = ParseScale(optarg); - break; - case 'p': - preprocess_output_filename = optarg; - break; - case 'i': - protocol = ParseProtocol(optarg); - break; - case '?': - Usage(argv); - break; - } - } - - if (model_name.empty()) { - Usage(argv, "-m flag must be specified"); - } - if (batch_size <= 0) { - Usage(argv, "batch size must be > 0"); - } - if (topk <= 0) { - Usage(argv, "topk must be > 0"); - } - if (optind >= argc) { - Usage(argv, "image file or image folder must be specified"); - } - - // Create the context for inference of the specified model. From it - // extract and validate that the model meets the requirements for - // image classification. - std::unique_ptr ctx; - nic::Error err; - if (protocol == ProtocolType::HTTP) { - err = nic::InferHttpContext::Create( - &ctx, url, model_name, model_version, verbose); - } else { - err = nic::InferGrpcContext::Create( - &ctx, url, model_name, model_version, verbose); - } - if (!err.IsOk()) { - std::cerr << "error: unable to create inference context: " << err - << std::endl; - exit(1); - } - - size_t c, h, w; - ni::ModelInput::Format format; - int type1, type3; - ParseModel(ctx, batch_size, &c, &h, &w, &format, &type1, &type3, verbose); - - // - - // Collect the names of the image(s). - std::vector image_filenames; - - struct stat name_stat; - if (stat(argv[optind], &name_stat) != 0) { - std::cerr << "Failed to find '" << std::string(argv[optind]) - << "': " << strerror(errno) << std::endl; - exit(1); - } - - if (name_stat.st_mode & S_IFDIR) { - const std::string dirname = argv[optind]; - DIR* dir_ptr = opendir(dirname.c_str()); - struct dirent* d_ptr; - while ((d_ptr = readdir(dir_ptr)) != NULL) { - const std::string filename = d_ptr->d_name; - if ((filename != ".") && (filename != "..")) { - image_filenames.push_back(dirname + "/" + filename); - } - } - closedir(dir_ptr); - } else { - image_filenames.push_back(argv[optind]); - } - - // Sort the filenames so that we always visit them in the same order - // (readdir does not guarantee any particular order). 
- std::sort(image_filenames.begin(), image_filenames.end()); - - // Preprocess the images into input data according to model - // requirements - std::vector> image_data; - for (const auto& fn : image_filenames) { - image_data.emplace_back(); - FileToInputData( - fn, c, h, w, format, type1, type3, scale, &(image_data.back())); - - if ((image_data.size() == 1) && !preprocess_output_filename.empty()) { - std::ofstream output_file(preprocess_output_filename); - std::ostream_iterator output_iterator(output_file); - std::copy(image_data[0].begin(), image_data[0].end(), output_iterator); - } - } - - // Configure context for 'batch_size' and 'topk' - std::unique_ptr options; - err = nic::InferContext::Options::Create(&options); - if (!err.IsOk()) { - std::cerr << "failed initializing infer options: " << err << std::endl; - exit(1); - } - - options->SetBatchSize(batch_size); - options->AddClassResult(ctx->Outputs()[0], topk); - err = ctx->SetRunOptions(*options); - if (!err.IsOk()) { - std::cerr << "failed initializing batch size: " << err << std::endl; - exit(1); - } - - // Send requests of 'batch_size' images. If the number of images - // isn't an exact multiple of 'batch_size' then just start over with - // the first images until the batch is filled. - // - // Number of requests sent = ceil(number of images / batch_size) - std::vector>> results; - std::vector> result_filenames; - std::vector> requests; - size_t image_idx = 0; - bool last_request = false; - while (!last_request) { - // Already verified that there is 1 input... - const auto& input = ctx->Inputs()[0]; - - // Reset the input for new request. - err = input->Reset(); - if (!err.IsOk()) { - std::cerr << "failed resetting input: " << err << std::endl; - exit(1); - } - - // Set input to be the next 'batch_size' images (preprocessed). - std::vector input_filenames; - for (size_t idx = 0; idx < batch_size; ++idx) { - input_filenames.push_back(image_filenames[image_idx]); - err = input->SetRaw(image_data[image_idx]); - if (!err.IsOk()) { - std::cerr << "failed setting input: " << err << std::endl; - exit(1); - } - - image_idx = (image_idx + 1) % image_data.size(); - if (image_idx == 0) { - last_request = true; - } - } - - result_filenames.emplace_back(std::move(input_filenames)); - - // Send request. 
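  // Note: the synchronous path below blocks in Run() until the response
  // arrives, while the asynchronous path only queues the request with
  // AsyncRun() and collects the results later, in send order, via
  // GetAsyncRunResults().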
- if (!async) { - results.emplace_back(); - err = ctx->Run(&(results.back())); - if (!err.IsOk()) { - std::cerr << "failed sending synchronous infer request: " << err - << std::endl; - exit(1); - } - } else { - std::shared_ptr req; - err = ctx->AsyncRun(&req); - if (!err.IsOk()) { - std::cerr << "failed sending asynchronous infer request: " << err - << std::endl; - exit(1); - } - - requests.emplace_back(std::move(req)); - } - } - - // For async, retrieve results according to the send order - if (async) { - for (auto& request : requests) { - results.emplace_back(); - err = ctx->GetAsyncRunResults(&(results.back()), request, true); - if (!err.IsOk()) { - std::cerr << "failed receiving infer response: " << err << std::endl; - exit(1); - } - } - } - - // Post-process the results to make prediction(s) - for (size_t idx = 0; idx < results.size(); idx++) { - std::cout << "Request " << idx << ", batch size " << batch_size - << std::endl; - Postprocess(results[idx], result_filenames[idx], batch_size); - } - - return 0; -} diff --git a/src/clients/c++/perf_client.cc b/src/clients/c++/perf_client.cc deleted file mode 100644 index 2c9067c82b..0000000000 --- a/src/clients/c++/perf_client.cc +++ /dev/null @@ -1,1377 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "src/clients/c++/request.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "src/core/constants.h" - -namespace ni = nvidia::inferenceserver; -namespace nic = nvidia::inferenceserver::client; - -//============================================================================== -// Perf Client -// -// Perf client provides various metrics to measure the performance of -// the inference server. It can either be used to measure the throughput, -// latency and time distribution under specific setting (i.e. fixed batch size -// and fixed concurrent requests), or be used to generate throughput-latency -// data point under dynamic setting (i.e. 
collecting throughput-latency data -// under different load level). -// -// The following data is collected and used as part of the metrics: -// - Throughput (infer/sec): -// The number of inference processed per second as seen by the client. -// The number of inference is measured by the multiplication of the number -// of requests and their batch size. And the total time is the time elapsed -// from when the client starts sending requests to when the client received -// all responses. -// - Latency (usec): -// The average elapsed time between when a request is sent and -// when the response for the request is received. -// -// There are two settings (see -d option) for the data collection: -// - Fixed concurrent request mode: -// In this setting, the client will maintain a fixed number of concurrent -// requests sent to the server (see -t option). See ConcurrencyManager for -// more detail. The number of requests will be the total number of requests -// sent within the time interval for measurement (see -p option) and -// the latency will be the average latency across all requests. -// -// Besides throughput and latency, which is measured in client side, -// the following data measured by the server will also be reported -// in this setting: -// - Concurrent request: the number of concurrent requests as specified -// in -t option -// - Batch size: the batch size of each request as specified in -b option -// - Inference count: batch size * number of inference requests -// - Cumulative time: the total time between request received and -// response sent on the requests sent by perf client. -// - Average Cumulative time: cumulative time / number of inference requests -// - Compute time: the total time it takes to run inferencing including time -// copying input tensors to GPU memory, time executing the model, -// and time copying output tensors from GPU memory for the requests -// sent by perf client. -// - Average compute time: compute time / number of inference requests -// - Queue time: the total time it takes to wait for an available model -// instance for the requests sent by perf client. -// - Average queue time: queue time / number of inference requests -// -// - Dynamic concurrent request mode: -// In this setting, the client will perform the following procedure: -// 1. Follows the procedure in fixed concurrent request mode using -// k concurrent requests (k starts at 1). -// 2. Gathers data reported from step 1. -// 3. Increases k by 1 and repeats step 1 and 2 until latency from current -// iteration exceeds latency threshold (see -l option) -// At each iteration, the data mentioned in fixed concurrent request mode -// will be reported. Besides that, after the procedure above, a collection -// of "throughput, latency, concurrent request count" tuples will be -// reported in increasing load level order. -// -// Options: -// -b: batch size for each request sent. -// -t: number of concurrent requests sent. If -d is set, -t indicate the number -// of concurrent requests to start with ("starting concurrency" level). -// -d: enable dynamic concurrent request mode. -// -l: latency threshold in msec, will have no effect if -d is not set. -// -p: time interval for each measurement window in msec. -// -// For detail of the options not listed, please refer to the usage. -// - -namespace { - -volatile bool early_exit = false; - -void -SignalHandler(int signum) -{ - std::cout << "Interrupt signal (" << signum << ") received." << std::endl - << "Waiting for in-flight inferences to complete." 
<< std::endl; - - early_exit = true; -} - -typedef struct PerformanceStatusStruct { - uint32_t concurrency; - size_t batch_size; - // Request count and elapsed time measured by server - uint64_t server_request_count; - uint64_t server_cumm_time_ns; - uint64_t server_queue_time_ns; - uint64_t server_compute_time_ns; - - // Request count and elapsed time measured by client - uint64_t client_request_count; - uint64_t client_duration_ns; - uint64_t client_min_latency_ns; - uint64_t client_max_latency_ns; - uint64_t client_avg_latency_ns; - // Using usec to avoid square of large number (large in nsec) - uint64_t std_us; - uint64_t client_avg_request_time_ns; - uint64_t client_avg_send_time_ns; - uint64_t client_avg_receive_time_ns; - // Per infer stat - int client_infer_per_sec; -} PerfStatus; - - -enum ProtocolType { HTTP = 0, GRPC = 1 }; - -//============================================================================== -// Concurrency Manager -// -// An instance of concurrency manager will be created at the beginning of the -// perf client and it will be used to simulate different load level in respect -// to number of concurrent infer requests and to report the performance status. -// After the creation, perf client obtains performance status under the setting -// specified in command line options by calling Step() function. -// -// (Tentative usage) -// std::unique_ptr manager; -// ConcurrencyManager::Create(&manager, ...); -// if (fixed_mode) { -// PerfStatus status_summary; -// manager->Step(status_summary, concurrent_request_count); -// Report(status_summary, ...); -// } else { -// PerfStatus status_summary; -// for (count = 1;;count++) { -// manager->Step(status_summary, count); -// Report(status_summary, ...); -// if (status_summary.avg_latency_us >= latency_threshold) -// break; -// } -// } -// -// Detail: -// Concurrency Manager will maintain the number of concurrent requests by using -// corresponding number of worker threads that keep sending randomly generated -// requests to the server. The worker threads will record the start time and end -// time of each request into a shared vector. -// -// The manager can adjust the number of concurrent requests by creating -// new threads or by pausing existing threads (by pause_index_). -// After the adjustment, the manager will actively measure the throughput until -// it is stable. Once stable, the manager update the 'status_summary' based on -// the most recent measurement. -// -// The measurement procedure: -// 1. Main thread gets start status from the server and records the start time. -// 2. After given time interval, main thread gets end status from the server and -// records the end time. -// 3. From the shared vector, Main thread uses data that are generated between -// recorded start time and end time to measure client side status and -// update status_summary. 
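// [Editorial sketch, not part of the original perf_client.cc source.] The
// measurement procedure described above reduces to: choose a window inside the
// recorded timestamps, keep only the requests that complete within that
// window, and derive throughput and average latency from them. Below is a
// minimal, self-contained illustration of that step under those assumptions;
// the names WindowSummary, ToNs and SummarizeWindow are hypothetical and exist
// only for this sketch (the real logic is in ConcurrencyManager::Summarize
// further down).
#include <cstddef>
#include <cstdint>
#include <ctime>
#include <utility>
#include <vector>

struct WindowSummary {
  uint64_t request_count = 0;
  uint64_t avg_latency_ns = 0;
  double infer_per_sec = 0.0;
};

inline uint64_t
ToNs(const struct timespec& ts)
{
  return ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

WindowSummary
SummarizeWindow(
    const std::vector<std::pair<struct timespec, struct timespec>>& stamps,
    uint64_t window_start_ns, uint64_t window_end_ns, size_t batch_size)
{
  WindowSummary s;
  uint64_t total_latency_ns = 0;
  for (const auto& ts : stamps) {
    const uint64_t start_ns = ToNs(ts.first);
    const uint64_t end_ns = ToNs(ts.second);
    // Only requests that end inside the window count toward the summary.
    if ((start_ns <= end_ns) && (end_ns >= window_start_ns) &&
        (end_ns <= window_end_ns)) {
      total_latency_ns += end_ns - start_ns;
      s.request_count++;
    }
  }
  if (s.request_count > 0) {
    s.avg_latency_ns = total_latency_ns / s.request_count;
    const double window_sec = (window_end_ns - window_start_ns) / 1e9;
    // Throughput is measured in inferences, i.e. requests times batch size.
    s.infer_per_sec = (s.request_count * batch_size) / window_sec;
  }
  return s;
}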
- -class ConcurrencyManager { - public: - ~ConcurrencyManager() - { - early_exit = true; - // wake up all threads - { - // Acquire lock first to make sure no worker thread is trying to pause - // (avoid dead lock) - std::lock_guard lk(wake_mutex_); - *pause_index_ = threads_.size(); - } - wake_signal_.notify_all(); - - size_t cnt = 0; - for (auto& thread : threads_) { - thread.join(); - if (!threads_status_[cnt]->IsOk()) { - std::cerr << "Thread [" << cnt - << "] had error: " << *(threads_status_[cnt]) << std::endl; - } - cnt++; - } - } - - static nic::Error Create( - std::unique_ptr* manager, const bool verbose, - const bool profile, const int32_t batch_size, const double stable_offset, - const uint64_t measurement_window_ms, const size_t max_measurement_count, - const bool async, const std::string& model_name, const int model_version, - const std::string& url, const ProtocolType protocol) - { - manager->reset(new ConcurrencyManager( - verbose, profile, batch_size, stable_offset, measurement_window_ms, - max_measurement_count, async, model_name, model_version, url, protocol)); - (*manager)->pause_index_.reset(new size_t(0)); - (*manager)->request_timestamps_.reset(new TimestampVector()); - return nic::Error(ni::RequestStatusCode::SUCCESS); - } - - // Step will adjust the number of concurrent requests to be the same as - // 'concurrent_request_count' (by creating threads or by pausing threads) - // and it will actively measure throughput in every 'measurement_window' msec - // until the throughput is stable. Once the throughput is stable, it summarize - // the most recent measurement into 'status_summary' - // NOTE: the requests are being sent regardless of the measurement, so the - // data returned by the server (see struct PerforamnceStatusStruct) will - // include more requests than what the client measures (we can't get the exact - // server status right before the first request and right after the last - // request). - nic::Error Step( - PerfStatus& status_summary, const size_t concurrent_request_count) - { - status_summary.concurrency = concurrent_request_count; - - // Adjust concurrency level - { - // Acquire lock first to make sure no worker thread is trying to pause - // (avoid dead lock) - std::lock_guard lk(wake_mutex_); - *pause_index_ = concurrent_request_count; - } - wake_signal_.notify_all(); - - // Create new threads if we can not provide concurrency needed - if (!async_) { - while (concurrent_request_count > threads_.size()) { - // Launch new thread for inferencing - threads_status_.emplace_back( - new nic::Error(ni::RequestStatusCode::SUCCESS)); - threads_context_stat_.emplace_back(new nic::InferContext::Stat()); - size_t new_thread_index = threads_.size(); - threads_.emplace_back( - &ConcurrencyManager::Infer, this, threads_status_.back(), - threads_context_stat_.back(), request_timestamps_, pause_index_, - new_thread_index); - } - } else { - // TODO: check how much extra latency async infer introduces. - // One worker thread still need to prepare the requests - // in sequence, intuitively, it seems like the concurrency level - // may not be as stable as using multiple worker threads. Maybe having - // multiple worker threads and each handles some number of requests? 
- - // One worker thread is sufficient for async mode - if (threads_.size() == 0) { - // Launch new thread for inferencing - threads_status_.emplace_back( - new nic::Error(ni::RequestStatusCode::SUCCESS)); - threads_context_stat_.emplace_back(new nic::InferContext::Stat()); - threads_.emplace_back( - &ConcurrencyManager::AsyncInfer, this, threads_status_.back(), - threads_context_stat_.back(), request_timestamps_, pause_index_); - } - } - - - std::cout << "Request concurrency: " << concurrent_request_count - << std::endl; - - // Start measurement - nic::Error err(ni::RequestStatusCode::SUCCESS); - - size_t recent_k = 3; - std::vector infer_per_sec; - std::vector latencies; - // Stable will only be changed if max_measurement_count >= recent_k - bool stable = true; - double avg_ips = 0; - uint64_t avg_latency = 0; - do { - // Check thread status to make sure that the actual concurrency level is - // consistent to the one being reported - // If some thread return early, main thread will return and - // the worker thread's error message will be reported - // when ConcurrencyManager's destructor get called. - for (auto& thread_status : threads_status_) { - if (!thread_status->IsOk()) { - return nic::Error( - ni::RequestStatusCode::INTERNAL, - "Failed to maintain concurrency level requested." - " Worker thread(s) failed to generate concurrent requests."); - } - } - - err = Measure(status_summary); - if (!err.IsOk()) { - return err; - } - infer_per_sec.push_back(status_summary.client_infer_per_sec); - latencies.push_back(status_summary.client_avg_latency_ns); - avg_ips += (double)infer_per_sec.back() / recent_k; - avg_latency += latencies.back() / recent_k; - - if (verbose_) { - std::cout << " Pass [" << infer_per_sec.size() - << "] throughput: " << infer_per_sec.back() << " infer/sec. " - << "Avg latency: " - << (status_summary.client_avg_latency_ns / 1000) - << " usec (std " << status_summary.std_us << " usec)" - << std::endl; - } - - if (infer_per_sec.size() >= recent_k) { - size_t idx = infer_per_sec.size() - recent_k; - if (infer_per_sec.size() > recent_k) { - avg_ips -= (double)infer_per_sec[idx - 1] / recent_k; - avg_latency -= latencies[idx - 1] / recent_k; - } - stable = true; - for (; idx < infer_per_sec.size(); idx++) { - // We call it stable only if recent_k measurement are within - // +/-(stable_offset_)% of the average infer per second and latency - if ( - (infer_per_sec[idx] < avg_ips * (1 - stable_offset_)) || - (infer_per_sec[idx] > avg_ips * (1 + stable_offset_))) { - stable = false; - break; - } - if ( - (latencies[idx] < avg_latency * (1 - stable_offset_)) || - (latencies[idx] > avg_latency * (1 + stable_offset_))) { - stable = false; - break; - } - } - if (stable) { - break; - } - } - } while ((!early_exit) && (infer_per_sec.size() < max_measurement_count_)); - if (early_exit) { - return nic::Error( - ni::RequestStatusCode::INTERNAL, "Received exit signal."); - } else if (!stable) { - std::cerr << "Failed to obtain stable measurement within " - << max_measurement_count_ - << " measurement windows for concurrency " - << concurrent_request_count << ". Please try to " - << "increase the time window." 
<< std::endl; - } - - return err; - } - - private: - using TimestampVector = - std::vector>; - - ConcurrencyManager( - const bool verbose, const bool profile, const int32_t batch_size, - const double stable_offset, const int32_t measurement_window_ms, - const size_t max_measurement_count, const bool async, - const std::string& model_name, const int model_version, - const std::string& url, const ProtocolType protocol) - : verbose_(verbose), profile_(profile), batch_size_(batch_size), - stable_offset_(stable_offset), - measurement_window_ms_(measurement_window_ms), - max_measurement_count_(max_measurement_count), async_(async), - model_name_(model_name), model_version_(model_version), url_(url), - protocol_(protocol) - { - } - - nic::Error StartProfile() - { - std::unique_ptr ctx; - nic::Error err; - if (protocol_ == ProtocolType::HTTP) { - err = nic::ProfileHttpContext::Create(&ctx, url_, false); - } else { - err = nic::ProfileGrpcContext::Create(&ctx, url_, false); - } - if (!err.IsOk()) { - return err; - } - - return ctx->StartProfile(); - } - - nic::Error StopProfile() - { - std::unique_ptr ctx; - nic::Error err; - if (protocol_ == ProtocolType::HTTP) { - err = nic::ProfileHttpContext::Create(&ctx, url_, false); - } else { - err = nic::ProfileGrpcContext::Create(&ctx, url_, false); - } - if (!err.IsOk()) { - return err; - } - - return ctx->StopProfile(); - } - - nic::Error GetModelStatus(ni::ModelStatus* model_status) - { - std::unique_ptr ctx; - nic::Error err; - if (protocol_ == ProtocolType::HTTP) { - err = - nic::ServerStatusHttpContext::Create(&ctx, url_, model_name_, false); - } else { - err = - nic::ServerStatusGrpcContext::Create(&ctx, url_, model_name_, false); - } - if (err.IsOk()) { - ni::ServerStatus server_status; - err = ctx->GetServerStatus(&server_status); - if (err.IsOk()) { - const auto& itr = server_status.model_status().find(model_name_); - if (itr == server_status.model_status().end()) { - err = nic::Error( - ni::RequestStatusCode::INTERNAL, "unable to find status for model"); - } else { - model_status->CopyFrom(itr->second); - } - } - } - - return err; - } - - nic::Error GetAccumulatedContextStat(nic::InferContext::Stat* contexts_stat) - { - std::lock_guard lk(status_report_mutex_); - for (auto& context_stat : threads_context_stat_) { - contexts_stat->completed_request_count += - context_stat->completed_request_count; - contexts_stat->cumulative_total_request_time_ns += - context_stat->cumulative_total_request_time_ns; - contexts_stat->cumulative_send_time_ns += - context_stat->cumulative_send_time_ns; - contexts_stat->cumulative_receive_time_ns += - context_stat->cumulative_receive_time_ns; - } - return nic::Error::Success; - } - - nic::Error Summarize( - PerfStatus& summary, const ni::ModelStatus& start_status, - const ni::ModelStatus& end_status, - const nic::InferContext::Stat& start_stat, - const nic::InferContext::Stat& end_stat) - { - nic::Error err(ni::RequestStatusCode::SUCCESS); - - //=============== - // Summarizing statistic measured by client - - // Get the requests in the shared vector - TimestampVector current_timestamps; - status_report_mutex_.lock(); - request_timestamps_->swap(current_timestamps); - status_report_mutex_.unlock(); - - // finding the start time of the first request - // and the end time of the last request in the timestamp queue - uint64_t first_request_start_ns = 0; - uint64_t last_request_end_ns = 0; - for (auto& timestamp : current_timestamps) { - uint64_t request_start_time = - timestamp.first.tv_sec * ni::NANOS_PER_SECOND + 
timestamp.first.tv_nsec; - uint64_t request_end_time = - timestamp.second.tv_sec * ni::NANOS_PER_SECOND + - timestamp.second.tv_nsec; - if ( - (first_request_start_ns > request_start_time) || - (first_request_start_ns == 0)) { - first_request_start_ns = request_start_time; - } - if ( - (last_request_end_ns < request_end_time) || - (last_request_end_ns == 0)) { - last_request_end_ns = request_end_time; - } - } - - // Define the measurement window [client_start_ns, client_end_ns) to be - // in the middle of the queue - uint64_t measurement_window_ns = measurement_window_ms_ * 1000 * 1000; - uint64_t offset = first_request_start_ns + measurement_window_ns; - offset = - (offset > last_request_end_ns) ? 0 : (last_request_end_ns - offset) / 2; - - uint64_t client_start_ns = first_request_start_ns + offset; - uint64_t client_end_ns = client_start_ns + measurement_window_ns; - uint64_t client_duration_ns = client_end_ns - client_start_ns; - - // Get measurement from requests that fall within the time interval - size_t valid_timestamp_count = 0; - uint64_t min_latency_ns = 0; - uint64_t max_latency_ns = 0; - uint64_t tol_latency_ns = 0; - uint64_t tol_square_latency_us = 0; - for (auto& timestamp : current_timestamps) { - uint64_t request_start_ns = - timestamp.first.tv_sec * ni::NANOS_PER_SECOND + timestamp.first.tv_nsec; - uint64_t request_end_ns = timestamp.second.tv_sec * ni::NANOS_PER_SECOND + - timestamp.second.tv_nsec; - - if (request_start_ns <= request_end_ns) { - // Only counting requests that end within the time interval - if ( - (request_end_ns >= client_start_ns) && - (request_end_ns <= client_end_ns)) { - uint64_t request_latency = request_end_ns - request_start_ns; - if ((request_latency < min_latency_ns) || (min_latency_ns == 0)) - min_latency_ns = request_latency; - if ((request_latency > max_latency_ns) || (max_latency_ns == 0)) - max_latency_ns = request_latency; - tol_latency_ns += request_latency; - tol_square_latency_us += - (request_latency * request_latency) / (1000 * 1000); - valid_timestamp_count++; - } - } - } - - if (valid_timestamp_count == 0) { - return nic::Error( - ni::RequestStatusCode::INTERNAL, - "No valid requests recorded within time interval." - " Please use a larger time window."); - } - - summary.batch_size = batch_size_; - summary.client_request_count = valid_timestamp_count; - summary.client_duration_ns = client_duration_ns; - float client_duration_sec = - (float)summary.client_duration_ns / ni::NANOS_PER_SECOND; - summary.client_infer_per_sec = - (int)(valid_timestamp_count * summary.batch_size / client_duration_sec); - summary.client_min_latency_ns = min_latency_ns; - summary.client_max_latency_ns = max_latency_ns; - summary.client_avg_latency_ns = tol_latency_ns / valid_timestamp_count; - - // calculate standard deviation - uint64_t expected_square_latency_us = - tol_square_latency_us / valid_timestamp_count; - uint64_t square_avg_latency_us = - (summary.client_avg_latency_ns * summary.client_avg_latency_ns) / - (1000 * 1000); - uint64_t var_us = (expected_square_latency_us > square_avg_latency_us) - ? 
(expected_square_latency_us - square_avg_latency_us) - : 0; - summary.std_us = (uint64_t)(sqrt(var_us)); - - size_t completed_count = - end_stat.completed_request_count - start_stat.completed_request_count; - uint64_t request_time_ns = end_stat.cumulative_total_request_time_ns - - start_stat.cumulative_total_request_time_ns; - uint64_t send_time_ns = - end_stat.cumulative_send_time_ns - start_stat.cumulative_send_time_ns; - uint64_t receive_time_ns = end_stat.cumulative_receive_time_ns - - start_stat.cumulative_receive_time_ns; - if (completed_count != 0) { - summary.client_avg_request_time_ns = request_time_ns / completed_count; - summary.client_avg_send_time_ns = send_time_ns / completed_count; - summary.client_avg_receive_time_ns = receive_time_ns / completed_count; - } - - //=============== - // Summarizing statistic measured by client - - // If model_version is -1 then look in the end status to find the - // latest (highest valued version) and use that as the version. - uint32_t status_model_version = 0; - if (model_version_ < 0) { - for (const auto& vp : end_status.version_status()) { - status_model_version = std::max(status_model_version, vp.first); - } - } else { - status_model_version = model_version_; - } - - const auto& vend_itr = - end_status.version_status().find(status_model_version); - if (vend_itr == end_status.version_status().end()) { - err = nic::Error( - ni::RequestStatusCode::INTERNAL, "missing model version status"); - } else { - const auto& end_itr = vend_itr->second.infer_stats().find(batch_size_); - if (end_itr == vend_itr->second.infer_stats().end()) { - err = nic::Error( - ni::RequestStatusCode::INTERNAL, "missing inference stats"); - } else { - uint64_t start_cnt = 0; - uint64_t start_cumm_time_ns = 0; - uint64_t start_queue_time_ns = 0; - uint64_t start_compute_time_ns = 0; - - const auto& vstart_itr = - start_status.version_status().find(status_model_version); - if (vstart_itr != start_status.version_status().end()) { - const auto& start_itr = - vstart_itr->second.infer_stats().find(batch_size_); - if (start_itr != vstart_itr->second.infer_stats().end()) { - start_cnt = start_itr->second.success().count(); - start_cumm_time_ns = start_itr->second.success().total_time_ns(); - start_queue_time_ns = start_itr->second.queue().total_time_ns(); - start_compute_time_ns = start_itr->second.compute().total_time_ns(); - } - } - - summary.server_request_count = - end_itr->second.success().count() - start_cnt; - summary.server_cumm_time_ns = - end_itr->second.success().total_time_ns() - start_cumm_time_ns; - summary.server_queue_time_ns = - end_itr->second.queue().total_time_ns() - start_queue_time_ns; - summary.server_compute_time_ns = - end_itr->second.compute().total_time_ns() - start_compute_time_ns; - } - } - return err; - } - - // Function for worker threads - void Infer( - std::shared_ptr err, - std::shared_ptr stat, - std::shared_ptr timestamp, - std::shared_ptr pause_index, const size_t thread_index) - { - // Create the context for inference of the specified model. 
- std::unique_ptr ctx; - if (protocol_ == ProtocolType::HTTP) { - *err = nic::InferHttpContext::Create( - &ctx, url_, model_name_, model_version_, false); - } else { - *err = nic::InferGrpcContext::Create( - &ctx, url_, model_name_, model_version_, false); - } - if (!err->IsOk()) { - return; - } - - if (batch_size_ > ctx->MaxBatchSize()) { - *err = nic::Error( - ni::RequestStatusCode::INVALID_ARG, - "expecting batch size <= " + std::to_string(ctx->MaxBatchSize()) + - " for model '" + ctx->ModelName() + "'"); - return; - } - - // Prepare context for 'batch_size' batches. Request that all - // outputs be returned. - std::unique_ptr options; - *err = nic::InferContext::Options::Create(&options); - if (!err->IsOk()) { - return; - } - - options->SetBatchSize(batch_size_); - for (const auto& output : ctx->Outputs()) { - options->AddRawResult(output); - } - - *err = ctx->SetRunOptions(*options); - if (!err->IsOk()) { - return; - } - - // Create a randomly initialized buffer that is large enough to - // provide the largest needed input. We (re)use this buffer for all - // input values. - size_t max_input_byte_size = 0; - for (const auto& input : ctx->Inputs()) { - max_input_byte_size = std::max(max_input_byte_size, input->ByteSize()); - } - - std::vector input_buf(max_input_byte_size); - for (size_t i = 0; i < input_buf.size(); ++i) { - input_buf[i] = rand(); - } - - // Initialize inputs to use random values... - for (const auto& input : ctx->Inputs()) { - *err = input->Reset(); - if (!err->IsOk()) { - return; - } - - for (size_t i = 0; i < batch_size_; ++i) { - *err = input->SetRaw(&input_buf[0], input->ByteSize()); - if (!err->IsOk()) { - return; - } - } - } - - // run inferencing until receiving exit signal to maintain server load. - do { - // Run inference to get output - std::vector> results; - - // Record the start time of the request - struct timespec start_time; - clock_gettime(CLOCK_MONOTONIC, &start_time); - - *err = ctx->Run(&results); - - // Record the end time of the request - struct timespec end_time; - clock_gettime(CLOCK_MONOTONIC, &end_time); - - if (!err->IsOk()) { - return; - } - - // Add the request timestamp to shared vector with proper locking - status_report_mutex_.lock(); - // Critical section - request_timestamps_->emplace_back(std::make_pair(start_time, end_time)); - // Update its InferContext statistic to shared Stat pointer - ctx->GetStat(stat.get()); - status_report_mutex_.unlock(); - - // Wait if the thread should be paused - if (thread_index >= *pause_index) { - // Using conditional variable to be able to wake up pausing threads - std::unique_lock lk(wake_mutex_); - wake_signal_.wait(lk, [thread_index, pause_index] { - return (thread_index < *pause_index); - }); - lk.unlock(); - } - // Stop inferencing if an early exit has been signaled. - } while (!early_exit); - } - - // Function for worker threads - void AsyncInfer( - std::shared_ptr err, - std::shared_ptr stat, - std::shared_ptr timestamp, - std::shared_ptr pause_index) - { - // Create the context for inference of the specified model. 
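// [Editorial sketch, not part of the original perf_client.cc source.] The
// synchronous worker loop above throttles itself with a shared pause index and
// a condition variable: a worker whose index is >= the pause index blocks
// after finishing its current request until the manager raises the index
// again, so the concurrency level can be lowered without joining threads. A
// stripped-down version of that pattern is shown here; PauseGate, SetLimit and
// WaitIfPaused are hypothetical names used only for this sketch.
#include <condition_variable>
#include <cstddef>
#include <mutex>

class PauseGate {
 public:
  // Called by the manager: workers with index < new_limit may keep running.
  void SetLimit(size_t new_limit)
  {
    {
      std::lock_guard<std::mutex> lk(mu_);
      limit_ = new_limit;
    }
    cv_.notify_all();
  }

  // Called by worker 'thread_index' after each request; blocks while paused.
  void WaitIfPaused(size_t thread_index)
  {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this, thread_index] { return thread_index < limit_; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  size_t limit_ = 0;
};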
- std::unique_ptr ctx; - if (protocol_ == ProtocolType::HTTP) { - *err = nic::InferHttpContext::Create( - &ctx, url_, model_name_, model_version_, false); - } else { - *err = nic::InferGrpcContext::Create( - &ctx, url_, model_name_, model_version_, false); - } - if (!err->IsOk()) { - return; - } - - if (batch_size_ > ctx->MaxBatchSize()) { - *err = nic::Error( - ni::RequestStatusCode::INVALID_ARG, - "expecting batch size <= " + std::to_string(ctx->MaxBatchSize()) + - " for model '" + ctx->ModelName() + "'"); - return; - } - - // Prepare context for 'batch_size' batches. Request that all - // outputs be returned. - std::unique_ptr options; - *err = nic::InferContext::Options::Create(&options); - if (!err->IsOk()) { - return; - } - - options->SetBatchSize(batch_size_); - for (const auto& output : ctx->Outputs()) { - options->AddRawResult(output); - } - - *err = ctx->SetRunOptions(*options); - if (!err->IsOk()) { - return; - } - - // Create a randomly initialized buffer that is large enough to - // provide the largest needed input. We (re)use this buffer for all - // input values. - size_t max_input_byte_size = 0; - for (const auto& input : ctx->Inputs()) { - max_input_byte_size = std::max(max_input_byte_size, input->ByteSize()); - } - - std::vector input_buf(max_input_byte_size); - for (size_t i = 0; i < input_buf.size(); ++i) { - input_buf[i] = rand(); - } - - // Initialize inputs to use random values... - for (const auto& input : ctx->Inputs()) { - *err = input->Reset(); - if (!err->IsOk()) { - return; - } - - for (size_t i = 0; i < batch_size_; ++i) { - *err = input->SetRaw(&input_buf[0], input->ByteSize()); - if (!err->IsOk()) { - return; - } - } - } - - std::map requests_start_time; - // run inferencing until receiving exit signal to maintain server load. 
- do { - // Run inference to get output - std::vector> results; - std::shared_ptr request; - - // Create async requests such that the number of ongoing requests - // matches the concurrency level (here is '*pause_index') - while (requests_start_time.size() < *pause_index) { - struct timespec start_time; - clock_gettime(CLOCK_MONOTONIC, &start_time); - *err = ctx->AsyncRun(&request); - if (!err->IsOk()) { - return; - } - requests_start_time.emplace(request->Id(), start_time); - } - - if (requests_start_time.size() < *pause_index) { - std::cerr << "This message shouldn't be printed twice in a row" - << std::endl; - } - - // Get any request that is completed and - // record the end time of the request - while (true) { - nic::Error tmp_err; - if (requests_start_time.size() >= *pause_index) { - tmp_err = ctx->GetReadyAsyncRequest(&request, true); - } else { - // Don't wait if worker needs to maintain concurrency level - // Just make sure all completed requests at the moment - // are measured correctly - tmp_err = ctx->GetReadyAsyncRequest(&request, false); - } - - if (tmp_err.Code() == ni::RequestStatusCode::UNAVAILABLE) { - break; - } else if (!tmp_err.IsOk()) { - *err = tmp_err; - return; - } - *err = ctx->GetAsyncRunResults(&results, request, true); - - struct timespec end_time; - clock_gettime(CLOCK_MONOTONIC, &end_time); - - if (!err->IsOk()) { - return; - } - - auto itr = requests_start_time.find(request->Id()); - struct timespec start_time = itr->second; - requests_start_time.erase(itr); - - // Add the request timestamp to shared vector with proper locking - status_report_mutex_.lock(); - // Critical section - request_timestamps_->emplace_back(std::make_pair(start_time, end_time)); - // Update its InferContext statistic to shared Stat pointer - ctx->GetStat(stat.get()); - status_report_mutex_.unlock(); - } - - // Stop inferencing if an early exit has been signaled. - } while (!early_exit); - } - - // Used for measurement - nic::Error Measure(PerfStatus& status_summary) - { - nic::Error err(ni::RequestStatusCode::SUCCESS); - - ni::ModelStatus start_status; - ni::ModelStatus end_status; - nic::InferContext::Stat start_stat; - nic::InferContext::Stat end_stat; - - err = GetModelStatus(&start_status); - if (!err.IsOk()) { - return err; - } - // Start profiling on the server if requested. - if (profile_) { - err = StartProfile(); - if (!err.IsOk()) { - return err; - } - } - - err = GetAccumulatedContextStat(&start_stat); - - // Wait for specified time interval in msec - std::this_thread::sleep_for( - std::chrono::milliseconds((uint64_t)(measurement_window_ms_ * 1.2))); - - err = GetAccumulatedContextStat(&end_stat); - - // Stop profiling on the server if requested. - if (profile_) { - err = StopProfile(); - if (!err.IsOk()) { - return err; - } - } - - // Get server status and then print report on difference between - // before and after status. 
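// [Editorial sketch, not part of the original perf_client.cc source.] The
// server-side numbers reported for a measurement window are differences of
// cumulative counters taken before and after the window, divided by the number
// of requests completed in between. A schematic version of that arithmetic is
// shown below with hypothetical CounterSnapshot / WindowServerStats structs;
// the real fields come from the ModelStatus protobuf handled in Summarize.
#include <cstdint>

struct CounterSnapshot {
  uint64_t success_count;
  uint64_t queue_time_ns;
  uint64_t compute_time_ns;
};

struct WindowServerStats {
  uint64_t request_count;
  uint64_t avg_queue_us;
  uint64_t avg_compute_us;
};

WindowServerStats
DiffSnapshots(const CounterSnapshot& start, const CounterSnapshot& end)
{
  WindowServerStats w{};
  w.request_count = end.success_count - start.success_count;
  if (w.request_count > 0) {
    // Cumulative nanosecond counters become per-request microsecond averages.
    w.avg_queue_us =
        (end.queue_time_ns - start.queue_time_ns) / w.request_count / 1000;
    w.avg_compute_us =
        (end.compute_time_ns - start.compute_time_ns) / w.request_count / 1000;
  }
  return w;
}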
- err = GetModelStatus(&end_status); - if (!err.IsOk()) { - return err; - } - - err = - Summarize(status_summary, start_status, end_status, start_stat, end_stat); - if (!err.IsOk()) { - return err; - } - - return nic::Error(ni::RequestStatusCode::SUCCESS); - } - - bool verbose_; - bool profile_; - size_t batch_size_; - double stable_offset_; - uint64_t measurement_window_ms_; - size_t max_measurement_count_; - bool async_; - std::string model_name_; - int model_version_; - std::string url_; - ProtocolType protocol_; - - // Note: early_exit signal is kept global - std::vector threads_; - std::vector> threads_status_; - std::vector> threads_context_stat_; - - // pause_index_ tells threads (with idx >= pause_index_) to pause sending - // requests such that load level can decrease without terminating threads. - std::shared_ptr pause_index_; - // Use condition variable to pause/continue worker threads - std::condition_variable wake_signal_; - std::mutex wake_mutex_; - - // Pointer to a vector of request timestamps - // Request latency will be end_time - start_time - std::shared_ptr request_timestamps_; - // Mutex to avoid race condition on adding elements into the timestamp vector - // and on updating context statistic. - std::mutex status_report_mutex_; -}; - -ProtocolType -ParseProtocol(const std::string& str) -{ - std::string protocol(str); - std::transform(protocol.begin(), protocol.end(), protocol.begin(), ::tolower); - if (protocol == "http") { - return ProtocolType::HTTP; - } else if (protocol == "grpc") { - return ProtocolType::GRPC; - } - - std::cerr << "unexpected protocol type \"" << str - << "\", expecting HTTP or gRPC" << std::endl; - exit(1); - - return ProtocolType::HTTP; -} - -nic::Error -Report( - const PerfStatus& summary, const size_t concurrent_request_count, - const ProtocolType protocol, const bool verbose) -{ - const uint64_t cnt = summary.server_request_count; - - const uint64_t cumm_time_us = summary.server_cumm_time_ns / 1000; - const uint64_t cumm_avg_us = cumm_time_us / cnt; - - const uint64_t queue_time_us = summary.server_queue_time_ns / 1000; - const uint64_t queue_avg_us = queue_time_us / cnt; - - const uint64_t compute_time_us = summary.server_compute_time_ns / 1000; - const uint64_t compute_avg_us = compute_time_us / cnt; - - const uint64_t avg_latency_us = summary.client_avg_latency_ns / 1000; - const uint64_t std_us = summary.std_us; - - const uint64_t avg_request_time_us = - summary.client_avg_request_time_ns / 1000; - const uint64_t avg_send_time_us = summary.client_avg_send_time_ns / 1000; - const uint64_t avg_receive_time_us = - summary.client_avg_receive_time_ns / 1000; - const uint64_t avg_response_wait_time_us = - avg_request_time_us - avg_send_time_us - avg_receive_time_us; - - std::string client_library_detail = " "; - if (protocol == ProtocolType::GRPC) { - client_library_detail += - "Avg gRPC time: " + - std::to_string( - avg_send_time_us + avg_receive_time_us + avg_request_time_us) + - " usec ("; - if (!verbose) { - client_library_detail += - "(un)marshal request/response " + - std::to_string(avg_send_time_us + avg_receive_time_us) + - " usec + response wait " + std::to_string(avg_request_time_us) + - " usec)"; - } else { - client_library_detail += - "marshal " + std::to_string(avg_send_time_us) + - " usec + response wait " + std::to_string(avg_request_time_us) + - " usec + unmarshal " + std::to_string(avg_receive_time_us) + " usec)"; - } - } else { - client_library_detail += - "Avg HTTP time: " + std::to_string(avg_request_time_us) + " usec ("; - 
if (!verbose) { - client_library_detail += - "send/recv " + std::to_string(avg_send_time_us + avg_receive_time_us) + - " usec + response wait " + std::to_string(avg_response_wait_time_us) + - " usec)"; - } else { - client_library_detail += - "send " + std::to_string(avg_send_time_us) + " usec + response wait " + - std::to_string(avg_response_wait_time_us) + " usec + receive " + - std::to_string(avg_receive_time_us) + " usec)"; - } - } - - std::cout << " Client: " << std::endl - << " Request count: " << summary.client_request_count - << std::endl - << " Throughput: " << summary.client_infer_per_sec - << " infer/sec" << std::endl - << " Avg latency: " << avg_latency_us << " usec" - << " (standard deviation " << std_us << " usec)" << std::endl - << client_library_detail << std::endl - << " Server: " << std::endl - << " Request count: " << cnt << std::endl - << " Avg request latency: " << cumm_avg_us << " usec" - << " (overhead " << (cumm_avg_us - queue_avg_us - compute_avg_us) - << " usec + " - << "queue " << queue_avg_us << " usec + " - << "compute " << compute_avg_us << " usec)" << std::endl - << std::endl; - - return nic::Error(ni::RequestStatusCode::SUCCESS); -} - -void -Usage(char** argv, const std::string& msg = std::string()) -{ - if (!msg.empty()) { - std::cerr << "error: " << msg << std::endl; - } - - std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; - std::cerr << "\t-v" << std::endl; - std::cerr << "\t-f " << std::endl; - std::cerr << "\t-b " << std::endl; - std::cerr << "\t-t " << std::endl; - std::cerr << "\t-d" << std::endl; - std::cerr << "\t-a" << std::endl; - std::cerr << "\t-l " << std::endl; - std::cerr << "\t-c " << std::endl; - std::cerr << "\t-s " << std::endl; - std::cerr << "\t-p " << std::endl; - std::cerr << "\t-r " - << std::endl; - std::cerr << "\t-n" << std::endl; - std::cerr << "\t-m " << std::endl; - std::cerr << "\t-x " << std::endl; - std::cerr << "\t-u " << std::endl; - std::cerr << "\t-i " - << std::endl; - std::cerr << std::endl; - std::cerr - << "The -d flag enables dynamic concurrent request count where the number" - << " of concurrent requests will increase linearly until the request" - << " latency is above the threshold set (see -l)." << std::endl; - std::cerr << "The -a flag changes the way to maintain concurrency level from" - << " sending synchronous requests to sending asynchrnous requests." - << std::endl; - std::cerr - << "For -t, it indicates the number of starting concurrent requests if -d" - << " flag is set." << std::endl; - std::cerr - << "For -s, it indicates the deviation threshold for the measurements. The" - << " measurement is considered as stable if the recent 3 measurements are " - << "within +/- (deviation threshold)% of their average in terms of both " - << "infer per second and latency. Default is 10(%)" << std::endl; - std::cerr - << "For -c, it indicates the maximum number of concurrent requests allowed" - << " if -d flag is set. Once the number of concurrent requests exceeds the" - << " maximum, the perf client will stop and exit regardless of the latency" - << " threshold. Default is 0 to indicate that no limit is set on the number" - << " of concurrent requests." << std::endl; - std::cerr - << "For -p, it indicates the time interval used for each measurement." - << " The perf client will sample a time interval specified by -p and" - << " take measurement over the requests completed" - << " within that time interval." 
<< std::endl; - std::cerr - << "For -r, it indicates the maximum number of measurements for each" - << " profiling setting. The perf client will take multiple measurements and" - << " report the measurement until it is stable. The perf client will abort" - << " if the measurement is still unstable after the maximum number of" - << " measuremnts." << std::endl; - std::cerr << "For -l, it has no effect unless -d flag is set." << std::endl; - std::cerr << "The -n flag enables profiling for the duration of the run" - << std::endl; - std::cerr - << "If -x is not specified the most recent version (that is, the highest " - << "numbered version) of the model will be used." << std::endl; - std::cerr << "For -i, available protocols are gRPC and HTTP. Default is HTTP." - << std::endl; - - exit(1); -} - -} // namespace - -int -main(int argc, char** argv) -{ - bool verbose = false; - bool profile = false; - bool dynamic_concurrency_mode = false; - bool profiling_asynchronous_infer = false; - uint64_t latency_threshold_ms = 0; - int32_t batch_size = 1; - int32_t concurrent_request_count = 1; - size_t max_concurrency = 0; - double stable_offset = 0.1; - uint64_t measurement_window_ms = 0; - size_t max_measurement_count = 10; - std::string model_name; - int model_version = -1; - std::string url("localhost:8000"); - std::string filename(""); - ProtocolType protocol = ProtocolType::HTTP; - - // Parse commandline... - int opt; - while ((opt = getopt(argc, argv, "vndac:u:m:x:b:t:p:i:l:r:s:f:")) != -1) { - switch (opt) { - case 'v': - verbose = true; - break; - case 'n': - profile = true; - break; - case 'd': - dynamic_concurrency_mode = true; - break; - case 'u': - url = optarg; - break; - case 'm': - model_name = optarg; - break; - case 'x': - model_version = atoi(optarg); - break; - case 'b': - batch_size = atoi(optarg); - break; - case 't': - concurrent_request_count = atoi(optarg); - break; - case 'p': - measurement_window_ms = atoi(optarg); - break; - case 'i': - protocol = ParseProtocol(optarg); - break; - case 'l': - latency_threshold_ms = atoi(optarg); - break; - case 'c': - max_concurrency = atoi(optarg); - break; - case 'r': - max_measurement_count = atoi(optarg); - break; - case 's': - stable_offset = atof(optarg) / 100; - break; - case 'f': - filename = optarg; - break; - case 'a': - profiling_asynchronous_infer = true; - break; - case '?': - Usage(argv); - break; - } - } - - if (model_name.empty()) { - Usage(argv, "-m flag must be specified"); - } - if (batch_size <= 0) { - Usage(argv, "batch size must be > 0"); - } - if (measurement_window_ms <= 0) { - Usage(argv, "measurement window must be > 0 in msec"); - } - if (concurrent_request_count <= 0) { - Usage(argv, "concurrent request count must be > 0"); - } - if (dynamic_concurrency_mode && latency_threshold_ms < 0) { - Usage(argv, "latency threshold must be >= 0 for dynamic concurrency mode"); - } - - // trap SIGINT to allow threads to exit gracefully - signal(SIGINT, SignalHandler); - - nic::Error err(ni::RequestStatusCode::SUCCESS); - std::unique_ptr manager; - err = ConcurrencyManager::Create( - &manager, verbose, profile, batch_size, stable_offset, - measurement_window_ms, max_measurement_count, profiling_asynchronous_infer, - model_name, model_version, url, protocol); - if (!err.IsOk()) { - std::cerr << err << std::endl; - return 1; - } - - // pre-run report - std::cout << "*** Measurement Settings ***" << std::endl - << " Batch size: " << batch_size << std::endl - << " Measurement window: " << measurement_window_ms << " msec" - << 
std::endl; - if (dynamic_concurrency_mode) { - std::cout << " Latency limit: " << latency_threshold_ms << " msec" - << std::endl; - if (max_concurrency != 0) { - std::cout << " Concurrency limit: " << max_concurrency - << " concurrent requests" << std::endl; - } - } - std::cout << std::endl; - - PerfStatus status_summary; - std::vector summary; - if (!dynamic_concurrency_mode) { - err = manager->Step(status_summary, concurrent_request_count); - if (err.IsOk()) { - err = Report(status_summary, concurrent_request_count, protocol, verbose); - } - } else { - for (size_t count = concurrent_request_count; - (count <= max_concurrency) || (max_concurrency == 0); count++) { - err = manager->Step(status_summary, count); - if (err.IsOk()) { - err = Report(status_summary, count, protocol, verbose); - summary.push_back(status_summary); - uint64_t avg_latency_ms = - status_summary.client_avg_latency_ns / (1000 * 1000); - if ((avg_latency_ms >= latency_threshold_ms) || !err.IsOk()) { - std::cerr << err << std::endl; - break; - } - } else { - break; - } - } - } - if (!err.IsOk()) { - std::cerr << err << std::endl; - return 1; - } - if (summary.size()) { - std::ofstream ofs(filename, std::ofstream::out); - // Can print more depending on verbose, but it seems too much information - std::cout << "Inferences/Second vs. Client Average Batch Latency" - << std::endl; - if (!filename.empty()) { - ofs << "Concurrency,Inferences/Second,Client Send," - << "Network+Server Send/Recv,Server Queue," - << "Server Compute,Client Recv" << std::endl; - } - - for (PerfStatus& status : summary) { - std::cout << "Concurrency: " << status.concurrency << ", " - << status.client_infer_per_sec << " infer/sec, latency " - << (status.client_avg_latency_ns / 1000) << " usec" - << std::endl; - } - - if (!filename.empty()) { - // Sort summary results in order of increasing infer/sec. - std::sort( - summary.begin(), summary.end(), - [](const PerfStatus& a, const PerfStatus& b) -> bool { - return a.client_infer_per_sec < b.client_infer_per_sec; - }); - - for (PerfStatus& status : summary) { - uint64_t avg_queue_ns = - status.server_queue_time_ns / status.server_request_count; - uint64_t avg_compute_ns = - status.server_compute_time_ns / status.server_request_count; - uint64_t avg_network_misc_ns = - status.client_avg_latency_ns - avg_queue_ns - avg_compute_ns - - status.client_avg_send_time_ns - status.client_avg_receive_time_ns; - - ofs << status.concurrency << "," << status.client_infer_per_sec << "," - << (status.client_avg_send_time_ns / 1000) << "," - << (avg_network_misc_ns / 1000) << "," << (avg_queue_ns / 1000) - << "," << (avg_compute_ns / 1000) << "," - << (status.client_avg_receive_time_ns / 1000) << std::endl; - } - } - ofs.close(); - } - return 0; -} diff --git a/src/clients/c++/request.cc b/src/clients/c++/request.cc deleted file mode 100644 index 8b1f2f508f..0000000000 --- a/src/clients/c++/request.cc +++ /dev/null @@ -1,2460 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. 
-// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "src/clients/c++/request.h" - -#include -#include -#include -#include -#include "src/core/constants.h" - -namespace nvidia { namespace inferenceserver { namespace client { - -//============================================================================== - -// Global initialization for libcurl. Libcurl requires global -// initialization before any other threads are created and before any -// curl methods are used. The curl_global static object is used to -// perform this initialization. -class CurlGlobal { - public: - CurlGlobal(); - ~CurlGlobal(); - - const Error& Status() const { return err_; } - - private: - Error err_; -}; - -CurlGlobal::CurlGlobal() : err_(RequestStatusCode::SUCCESS) -{ - if (curl_global_init(CURL_GLOBAL_ALL) != 0) { - err_ = Error(RequestStatusCode::INTERNAL, "global initialization failed"); - } -} - -CurlGlobal::~CurlGlobal() -{ - curl_global_cleanup(); -} - -static CurlGlobal curl_global; - -//============================================================================== - -// Use map to keep track of gRPC channels. : -// If context is created on url that has established Channel, then reuse it. 
-std::map> grpc_channel_map_; -std::shared_ptr -GetChannel(const std::string& url) -{ - const auto& channel_itr = grpc_channel_map_.find(url); - if (channel_itr != grpc_channel_map_.end()) { - return channel_itr->second; - } else { - grpc::ChannelArguments arguments; - arguments.SetMaxSendMessageSize(MAX_GRPC_MESSAGE_SIZE); - arguments.SetMaxReceiveMessageSize(MAX_GRPC_MESSAGE_SIZE); - std::shared_ptr channel = grpc::CreateCustomChannel( - url, grpc::InsecureChannelCredentials(), arguments); - grpc_channel_map_.insert(std::make_pair(url, channel)); - return channel; - } -} - -//============================================================================== - -const Error Error::Success(RequestStatusCode::SUCCESS); - -Error::Error(RequestStatusCode code, const std::string& msg) - : code_(code), msg_(msg), request_id_(0) -{ -} - -Error::Error(RequestStatusCode code) : code_(code), request_id_(0) {} - -Error::Error(const RequestStatus& status) : Error(status.code(), status.msg()) -{ - server_id_ = status.server_id(); - request_id_ = status.request_id(); -} - -std::ostream& -operator<<(std::ostream& out, const Error& err) -{ - out << "[" << err.server_id_ << " " << err.request_id_ << "] " - << RequestStatusCode_Name(err.code_); - if (!err.msg_.empty()) { - out << " - " << err.msg_; - } - return out; -} - -//============================================================================== - -ServerHealthContext::ServerHealthContext(bool verbose) : verbose_(verbose) {} - -//============================================================================== - -ServerStatusContext::ServerStatusContext(bool verbose) : verbose_(verbose) {} - -//============================================================================== - -class OptionsImpl : public InferContext::Options { - public: - OptionsImpl(); - ~OptionsImpl() = default; - - size_t BatchSize() const override { return batch_size_; } - void SetBatchSize(size_t batch_size) override { batch_size_ = batch_size; } - - Error AddRawResult( - const std::shared_ptr& output) override; - Error AddClassResult( - const std::shared_ptr& output, uint64_t k) override; - - // Options for an output - struct OutputOptions { - OutputOptions(InferContext::Result::ResultFormat f, uint64_t n = 0) - : result_format(f), u64(n) - { - } - InferContext::Result::ResultFormat result_format; - uint64_t u64; - }; - - using OutputOptionsPair = - std::pair, OutputOptions>; - - const std::vector& Outputs() const { return outputs_; } - - private: - size_t batch_size_; - std::vector outputs_; -}; - -OptionsImpl::OptionsImpl() : batch_size_(0) {} - -Error -OptionsImpl::AddRawResult(const std::shared_ptr& output) -{ - outputs_.emplace_back(std::make_pair( - output, OutputOptions(InferContext::Result::ResultFormat::RAW))); - return Error::Success; -} - -Error -OptionsImpl::AddClassResult( - const std::shared_ptr& output, uint64_t k) -{ - outputs_.emplace_back(std::make_pair( - output, OutputOptions(InferContext::Result::ResultFormat::CLASS, k))); - return Error::Success; -} - -Error -InferContext::Options::Create(std::unique_ptr* options) -{ - options->reset(new OptionsImpl()); - return Error::Success; -} - -//============================================================================== - -class InputImpl : public InferContext::Input { - public: - InputImpl(const ModelInput& mio); - InputImpl(const InputImpl& obj); - ~InputImpl() = default; - - const std::string& Name() const override { return mio_.name(); } - size_t ByteSize() const override { return byte_size_; } - DataType DType() const 
override { return mio_.data_type(); } - ModelInput::Format Format() const override { return mio_.format(); } - const DimsList& Dims() const override { return mio_.dims(); } - - void SetBatchSize(size_t batch_size) { batch_size_ = batch_size; } - - Error Reset() override; - Error SetRaw(const std::vector& input) override; - Error SetRaw(const uint8_t* input, size_t input_byte_size) override; - - // Copy into 'buf' up to 'size' bytes of this input's data. Return - // the actual amount copied in 'input_bytes' and if the end of input - // is reached in 'end_of_input' - Error GetNext( - uint8_t* buf, size_t size, size_t* input_bytes, bool* end_of_input); - - // Copy the pointer of the raw buffer at 'batch_idx' into 'buf' - Error GetRaw(size_t batch_idx, const uint8_t** buf) const; - - // Prepare to send this input as part of a request. - Error PrepareForRequest(); - - private: - const ModelInput mio_; - const size_t byte_size_; - size_t batch_size_; - std::vector bufs_; - size_t bufs_idx_, buf_pos_; -}; - -InputImpl::InputImpl(const ModelInput& mio) - : mio_(mio), byte_size_(GetSize(mio)), batch_size_(0), bufs_idx_(0), - buf_pos_(0) -{ -} - -InputImpl::InputImpl(const InputImpl& obj) - : mio_(obj.mio_), byte_size_(obj.byte_size_), batch_size_(obj.batch_size_), - bufs_idx_(0), buf_pos_(0) -{ - // Set raw inputs - for (size_t batch_idx = 0; batch_idx < batch_size_; batch_idx++) { - const uint8_t* data_ptr; - obj.GetRaw(batch_idx, &data_ptr); - SetRaw(data_ptr, byte_size_); - } -} - -Error -InputImpl::SetRaw(const uint8_t* input, size_t input_byte_size) -{ - if (input_byte_size != byte_size_) { - bufs_.clear(); - return Error( - RequestStatusCode::INVALID_ARG, - "invalid size " + std::to_string(input_byte_size) + " bytes for input '" + - Name() + "', expects " + std::to_string(byte_size_) + " bytes"); - } - - if (bufs_.size() >= batch_size_) { - bufs_.clear(); - return Error( - RequestStatusCode::INVALID_ARG, "expecting " + - std::to_string(batch_size_) + - " invocations of SetRaw for input '" + - Name() + "', one per batch entry"); - } - - bufs_.push_back(input); - return Error::Success; -} - -Error -InputImpl::SetRaw(const std::vector& input) -{ - return SetRaw(&input[0], input.size()); -} - -Error -InputImpl::GetNext( - uint8_t* buf, size_t size, size_t* input_bytes, bool* end_of_input) -{ - size_t total_size = 0; - - while ((bufs_idx_ < bufs_.size()) && (size > 0)) { - const size_t csz = std::min(byte_size_ - buf_pos_, size); - if (csz > 0) { - const uint8_t* input_ptr = bufs_[bufs_idx_] + buf_pos_; - std::copy(input_ptr, input_ptr + csz, buf); - buf_pos_ += csz; - buf += csz; - size -= csz; - total_size += csz; - } - - if (buf_pos_ == byte_size_) { - bufs_idx_++; - buf_pos_ = 0; - } - } - - *input_bytes = total_size; - *end_of_input = (bufs_idx_ >= bufs_.size()); - return Error::Success; -} - -Error -InputImpl::GetRaw(size_t batch_idx, const uint8_t** buf) const -{ - if (batch_idx >= batch_size_) { - return Error( - RequestStatusCode::INVALID_ARG, - "unexpected batch entry " + std::to_string(batch_idx) + - " requested for input '" + Name() + "', batch size is " + - std::to_string(batch_size_)); - } - - *buf = bufs_[batch_idx]; - return Error::Success; -} - -Error -InputImpl::Reset() -{ - bufs_.clear(); - bufs_idx_ = 0; - buf_pos_ = 0; - return Error::Success; -} - -Error -InputImpl::PrepareForRequest() -{ - if (bufs_.size() != batch_size_) { - return Error( - RequestStatusCode::INVALID_ARG, - "expecting " + std::to_string(batch_size_) + - " invocations of SetRaw for input '" + Name() + "', 
have " + - std::to_string(bufs_.size())); - } - - // Reset position so request sends entire input. - bufs_idx_ = 0; - buf_pos_ = 0; - return Error::Success; -} - -//============================================================================== - -class OutputImpl : public InferContext::Output { - public: - OutputImpl(const ModelOutput& mio); - ~OutputImpl() = default; - - const std::string& Name() const override { return mio_.name(); } - size_t ByteSize() const override { return byte_size_; } - DataType DType() const override { return mio_.data_type(); } - const DimsList& Dims() const override { return mio_.dims(); } - - InferContext::Result::ResultFormat ResultFormat() const - { - return result_format_; - } - void SetResultFormat(InferContext::Result::ResultFormat result_format) - { - result_format_ = result_format; - } - - private: - const ModelOutput mio_; - const size_t byte_size_; - InferContext::Result::ResultFormat result_format_; -}; - -OutputImpl::OutputImpl(const ModelOutput& mio) - : mio_(mio), byte_size_(GetSize(mio)), - result_format_(InferContext::Result::ResultFormat::RAW) -{ -} - -//============================================================================== - -class ResultImpl : public InferContext::Result { - public: - ResultImpl( - const std::shared_ptr& output, uint64_t batch_size, - InferContext::Result::ResultFormat result_format); - ~ResultImpl() = default; - - const std::string& ModelName() const override { return model_name_; } - uint32_t ModelVersion() const override { return model_version_; } - - const std::shared_ptr GetOutput() const override - { - return output_; - } - - Error GetRaw( - size_t batch_idx, const std::vector** buf) const override; - Error GetRawAtCursor( - size_t batch_idx, const uint8_t** buf, size_t adv_byte_size) override; - Error GetClassCount(size_t batch_idx, size_t* cnt) const override; - Error GetClassAtCursor(size_t batch_idx, ClassResult* result) override; - Error ResetCursors() override; - Error ResetCursor(size_t batch_idx) override; - - // Get the result format for this result. - InferContext::Result::ResultFormat ResultFormat() const - { - return result_format_; - } - - // Set information about the model that produced this result. - void SetModel(const std::string& name, const uint32_t version) - { - model_name_ = name; - model_version_ = version; - } - - // Set results for a CLASS format result. - void SetClassResult(const InferResponseHeader::Output& result) - { - class_result_ = result; - } - - // For RAW format result, copy into the output up to 'size' bytes of - // output data from 'buf'. Return the actual amount copied in - // 'result_bytes'. 
- Error SetNextRawResult(const uint8_t* buf, size_t size, size_t* result_bytes); - - private: - const std::shared_ptr output_; - const size_t byte_size_; - const size_t batch_size_; - const InferContext::Result::ResultFormat result_format_; - - std::vector> bufs_; - size_t bufs_idx_; - std::vector bufs_pos_; - - std::string model_name_; - uint32_t model_version_; - - InferResponseHeader::Output class_result_; - std::vector class_pos_; -}; - -ResultImpl::ResultImpl( - const std::shared_ptr& output, uint64_t batch_size, - InferContext::Result::ResultFormat result_format) - : output_(output), byte_size_(output->ByteSize()), batch_size_(batch_size), - result_format_(result_format), bufs_(batch_size), bufs_idx_(0), - bufs_pos_(batch_size), class_pos_(batch_size) -{ -} - -Error -ResultImpl::GetRaw(size_t batch_idx, const std::vector** buf) const -{ - if (result_format_ != InferContext::Result::ResultFormat::RAW) { - return Error( - RequestStatusCode::UNSUPPORTED, - "raw result not available for non-RAW output '" + output_->Name() + "'"); - } - - if (batch_idx >= batch_size_) { - return Error( - RequestStatusCode::INVALID_ARG, - "unexpected batch entry " + std::to_string(batch_idx) + - " requested for output '" + output_->Name() + "', batch size is " + - std::to_string(batch_size_)); - } - - *buf = &bufs_[batch_idx]; - return Error::Success; -} - -Error -ResultImpl::GetRawAtCursor( - size_t batch_idx, const uint8_t** buf, size_t adv_byte_size) -{ - if (result_format_ != InferContext::Result::ResultFormat::RAW) { - return Error( - RequestStatusCode::UNSUPPORTED, - "raw result not available for non-RAW output '" + output_->Name() + "'"); - } - - if (batch_idx >= batch_size_) { - return Error( - RequestStatusCode::INVALID_ARG, - "unexpected batch entry " + std::to_string(batch_idx) + - "requested for output '" + output_->Name() + "', batch size is " + - std::to_string(batch_size_)); - } - - if ((bufs_pos_[batch_idx] + adv_byte_size) > byte_size_) { - return Error( - RequestStatusCode::UNSUPPORTED, - "attempt to read beyond end of result for output output '" + - output_->Name() + "'"); - } - - *buf = &bufs_[batch_idx][bufs_pos_[batch_idx]]; - bufs_pos_[batch_idx] += adv_byte_size; - return Error::Success; -} - -Error -ResultImpl::GetClassCount(size_t batch_idx, size_t* cnt) const -{ - if (result_format_ != InferContext::Result::ResultFormat::CLASS) { - return Error( - RequestStatusCode::UNSUPPORTED, - "class result not available for non-CLASS output '" + output_->Name() + - "'"); - } - - // Number of classifications should equal expected batch size but - // check both to be careful and to protext class_pos_ accesses. 
- if ( - (batch_idx >= (size_t)class_result_.batch_classes().size()) || - (batch_idx >= batch_size_)) { - return Error( - RequestStatusCode::INVALID_ARG, - "unexpected batch entry " + std::to_string(batch_idx) + - "requested for output '" + output_->Name() + "', batch size is " + - std::to_string(batch_size_)); - } - - const InferResponseHeader::Output::Classes& classes = - class_result_.batch_classes(batch_idx); - - *cnt = classes.cls().size(); - return Error::Success; -} - -Error -ResultImpl::GetClassAtCursor( - size_t batch_idx, InferContext::Result::ClassResult* result) -{ - if (result_format_ != InferContext::Result::ResultFormat::CLASS) { - return Error( - RequestStatusCode::UNSUPPORTED, - "class result not available for non-CLASS output '" + output_->Name() + - "'"); - } - - // Number of classifications should equal expected batch size but - // check both to be careful and to protext class_pos_ accesses. - if ( - (batch_idx >= (size_t)class_result_.batch_classes().size()) || - (batch_idx >= batch_size_)) { - return Error( - RequestStatusCode::INVALID_ARG, - "unexpected batch entry " + std::to_string(batch_idx) + - "requested for output '" + output_->Name() + "', batch size is " + - std::to_string(batch_size_)); - } - - const InferResponseHeader::Output::Classes& classes = - class_result_.batch_classes(batch_idx); - - if (class_pos_[batch_idx] >= (size_t)classes.cls().size()) { - return Error( - RequestStatusCode::UNSUPPORTED, - "attempt to read beyond end of result for output output '" + - output_->Name() + "'"); - } - - const InferResponseHeader::Output::Class& cls = - classes.cls(class_pos_[batch_idx]); - - result->idx = cls.idx(); - result->value = cls.value(); - result->label = cls.label(); - - class_pos_[batch_idx]++; - return Error::Success; -} - -Error -ResultImpl::ResetCursors() -{ - std::fill(bufs_pos_.begin(), bufs_pos_.end(), 0); - std::fill(class_pos_.begin(), class_pos_.end(), 0); - return Error::Success; -} - -Error -ResultImpl::ResetCursor(size_t batch_idx) -{ - if (batch_idx >= batch_size_) { - return Error( - RequestStatusCode::INVALID_ARG, - "unexpected batch entry " + std::to_string(batch_idx) + - "requested for output '" + output_->Name() + "', batch size is " + - std::to_string(batch_size_)); - } - - bufs_pos_[batch_idx] = 0; - class_pos_[batch_idx] = 0; - return Error::Success; -} - -Error -ResultImpl::SetNextRawResult( - const uint8_t* buf, size_t size, size_t* result_bytes) -{ - size_t total_size = 0; - - while ((bufs_idx_ < bufs_.size()) && (size > 0)) { - const size_t csz = std::min(byte_size_ - bufs_pos_[bufs_idx_], size); - if (csz > 0) { - std::copy(buf, buf + csz, std::back_inserter(bufs_[bufs_idx_])); - bufs_pos_[bufs_idx_] += csz; - buf += csz; - size -= csz; - total_size += csz; - } - - if (bufs_pos_[bufs_idx_] == byte_size_) { - bufs_idx_++; - } - } - - *result_bytes = total_size; - return Error::Success; -} - -//============================================================================== - -InferContext::RequestTimers::RequestTimers() -{ - Reset(); -} - -Error -InferContext::RequestTimers::Reset() -{ - request_start_.tv_sec = 0; - request_end_.tv_sec = 0; - send_start_.tv_sec = 0; - send_end_.tv_sec = 0; - receive_start_.tv_sec = 0; - receive_end_.tv_sec = 0; - request_start_.tv_nsec = 0; - request_end_.tv_nsec = 0; - send_start_.tv_nsec = 0; - send_end_.tv_nsec = 0; - receive_start_.tv_nsec = 0; - receive_end_.tv_nsec = 0; - return Error::Success; -} - -Error -InferContext::RequestTimers::Record(Kind kind) -{ - switch (kind) { - case 
Kind::REQUEST_START: - clock_gettime(CLOCK_MONOTONIC, &request_start_); - break; - case Kind::REQUEST_END: - clock_gettime(CLOCK_MONOTONIC, &request_end_); - break; - case Kind::SEND_START: - clock_gettime(CLOCK_MONOTONIC, &send_start_); - break; - case Kind::SEND_END: - clock_gettime(CLOCK_MONOTONIC, &send_end_); - break; - case Kind::RECEIVE_START: - clock_gettime(CLOCK_MONOTONIC, &receive_start_); - break; - case Kind::RECEIVE_END: - clock_gettime(CLOCK_MONOTONIC, &receive_end_); - break; - } - return Error::Success; -} - -//============================================================================== - -class RequestImpl : public InferContext::Request { - public: - virtual ~RequestImpl() = default; - - uint64_t Id() const { return id_; }; - - // Initialize 'requested_results_' according to 'batch_size' and - // 'requested_outs' as the placeholder for the results - Error InitializeRequestedResults( - const std::vector>& requested_outs, - const size_t batch_size); - - // Return the results of the request. 'ready_' should always be checked - // before calling GetResults() to ensure the request has been completed. - virtual Error GetResults( - std::vector>* results) = 0; - - protected: - RequestImpl(const uint64_t id); - - // Helper function called after inference to set non-RAW results in - // 'requested_results_'. - Error PostRunProcessing( - std::vector>& results, - const InferResponseHeader& infer_response); - - friend class InferContext; - - // Identifier seen by user - uint64_t id_; - - // Internal identifier for asynchronous call - uintptr_t run_index_; - - // Indicating if the request has been completed. - bool ready_; - - // The timer for infer request. - InferContext::RequestTimers timer_; - - // Results being collected for the requested outputs from inference - // server response. - std::vector> requested_results_; - - // Current positions within output vectors when processing response. - size_t result_pos_idx_; -}; - -RequestImpl::RequestImpl(const uint64_t id) - : id_(id), ready_(false), result_pos_idx_(0) -{ -} - -Error -RequestImpl::InitializeRequestedResults( - const std::vector>& requested_outs, - const size_t batch_size) -{ - // Initialize the results vector to collect the requested results. - requested_results_.clear(); - for (const auto& io : requested_outs) { - std::unique_ptr rp(new ResultImpl( - io, batch_size, reinterpret_cast(io.get())->ResultFormat())); - requested_results_.emplace_back(std::move(rp)); - } - return Error::Success; -} - -Error -RequestImpl::PostRunProcessing( - std::vector>& results, - const InferResponseHeader& infer_response) -{ - // At this point, the RAW requested results have their result values - // set. Now need to initialize non-RAW results. 
- for (auto& rr : results) { - ResultImpl* r = reinterpret_cast(rr.get()); - r->SetModel(infer_response.model_name(), infer_response.model_version()); - switch (r->ResultFormat()) { - case InferContext::Result::ResultFormat::RAW: - r->ResetCursors(); - break; - - case InferContext::Result::ResultFormat::CLASS: { - for (const auto& ir : infer_response.output()) { - if (ir.name() == r->GetOutput()->Name()) { - r->SetClassResult(ir); - break; - } - } - break; - } - } - } - return Error::Success; -} - -//============================================================================== - -InferContext::InferContext( - const std::string& model_name, int model_version, bool verbose) - : model_name_(model_name), model_version_(model_version), verbose_(verbose), - total_input_byte_size_(0), batch_size_(0), async_request_id_(0), - worker_(), exiting_(true) -{ -} - -Error -InferContext::GetInput( - const std::string& name, std::shared_ptr* input) const -{ - for (const auto& io : inputs_) { - if (io->Name() == name) { - *input = io; - return Error::Success; - } - } - - return Error( - RequestStatusCode::INVALID_ARG, - "unknown input '" + name + "' for '" + model_name_ + "'"); -} - -Error -InferContext::GetOutput( - const std::string& name, std::shared_ptr* output) const -{ - for (const auto& io : outputs_) { - if (io->Name() == name) { - *output = io; - return Error::Success; - } - } - - return Error( - RequestStatusCode::INVALID_ARG, - "unknown output '" + name + "' for '" + model_name_ + "'"); -} - -Error -InferContext::SetRunOptions(const InferContext::Options& boptions) -{ - const OptionsImpl& options = reinterpret_cast(boptions); - - // If the model doesn't support batching (i.e. max_batch_size_ == 0) - // then still allow batch size of 1 to be specified. - uint64_t effective_max_batch_size = std::max((uint64_t)1, max_batch_size_); - if (options.BatchSize() > effective_max_batch_size) { - return Error( - RequestStatusCode::INVALID_ARG, - "run batch-size " + std::to_string(options.BatchSize()) + - " exceeds maximum batch size " + - std::to_string(effective_max_batch_size) + " allowed for model '" + - model_name_ + "'"); - } - - // If batch-size 0 was requested (no batching) treat it like - // batch-size 1. - batch_size_ = std::max((uint64_t)1, options.BatchSize()); - total_input_byte_size_ = 0; - - // Create the InferRequestHeader protobuf. This protobuf will be - // used for all subsequent requests. 
- infer_request_.Clear(); - - infer_request_.set_batch_size(batch_size_); - - for (const auto& io : inputs_) { - reinterpret_cast(io.get())->SetBatchSize(batch_size_); - total_input_byte_size_ += io->ByteSize() * batch_size_; - - auto rinput = infer_request_.add_input(); - rinput->set_name(io->Name()); - rinput->set_byte_size(io->ByteSize()); - } - - requested_outputs_.clear(); - - for (const auto& p : options.Outputs()) { - const std::shared_ptr& output = p.first; - const OptionsImpl::OutputOptions& ooptions = p.second; - - reinterpret_cast(output.get()) - ->SetResultFormat(ooptions.result_format); - requested_outputs_.emplace_back(output); - - auto routput = infer_request_.add_output(); - routput->set_name(output->Name()); - routput->set_byte_size(output->ByteSize()); - if (ooptions.result_format == Result::ResultFormat::CLASS) { - routput->mutable_cls()->set_count(ooptions.u64); - } - } - - return Error::Success; -} - -Error -InferContext::GetStat(Stat* stat) -{ - stat->completed_request_count = context_stat_.completed_request_count; - stat->cumulative_total_request_time_ns = - context_stat_.cumulative_total_request_time_ns; - stat->cumulative_send_time_ns = context_stat_.cumulative_send_time_ns; - stat->cumulative_receive_time_ns = context_stat_.cumulative_receive_time_ns; - return Error::Success; -} - -Error -InferContext::UpdateStat(const RequestTimers& timer) -{ - uint64_t request_start_ns = timer.request_start_.tv_sec * NANOS_PER_SECOND + - timer.request_start_.tv_nsec; - uint64_t request_end_ns = - timer.request_end_.tv_sec * NANOS_PER_SECOND + timer.request_end_.tv_nsec; - uint64_t send_start_ns = - timer.send_start_.tv_sec * NANOS_PER_SECOND + timer.send_start_.tv_nsec; - uint64_t send_end_ns = - timer.send_end_.tv_sec * NANOS_PER_SECOND + timer.send_end_.tv_nsec; - uint64_t receive_start_ns = timer.receive_start_.tv_sec * NANOS_PER_SECOND + - timer.receive_start_.tv_nsec; - uint64_t receive_end_ns = - timer.receive_end_.tv_sec * NANOS_PER_SECOND + timer.receive_end_.tv_nsec; - if ( - (request_start_ns >= request_end_ns) || (send_start_ns > send_end_ns) || - (receive_start_ns > receive_end_ns)) { - return Error(RequestStatusCode::INVALID_ARG, "Timer not set correctly."); - } - - uint64_t request_time_ns = request_end_ns - request_start_ns; - uint64_t send_time_ns = send_end_ns - send_start_ns; - uint64_t receive_time_ns = receive_end_ns - receive_start_ns; - - context_stat_.completed_request_count++; - context_stat_.cumulative_total_request_time_ns += request_time_ns; - context_stat_.cumulative_send_time_ns += send_time_ns; - context_stat_.cumulative_receive_time_ns += receive_time_ns; - return Error::Success; -} - -Error -InferContext::GetReadyAsyncRequest(std::shared_ptr* request, bool wait) -{ - if (ongoing_async_requests_.size() == 0) { - return Error( - RequestStatusCode::UNAVAILABLE, - "No asynchronous requests have been sent"); - } - - Error err; - std::unique_lock lock(mutex_); - cv_.wait(lock, [&err, request, this, wait] { - for (auto& ongoing_async_request : this->ongoing_async_requests_) { - if (std::static_pointer_cast(ongoing_async_request.second) - ->ready_) { - *request = ongoing_async_request.second; - err = Error::Success; - return true; - } - } - - if (!wait) { - err = Error(RequestStatusCode::UNAVAILABLE, "No completed request."); - return true; - } else { - return false; - } - }); - - lock.unlock(); - return err; -} - -Error -InferContext::IsRequestReady( - const std::shared_ptr& async_request, bool wait) -{ - if (ongoing_async_requests_.size() == 0) { - 
return Error( - RequestStatusCode::INVALID_ARG, - "No asynchronous requests have been sent"); - } - - std::shared_ptr request = - std::static_pointer_cast(async_request); - - auto itr = ongoing_async_requests_.find(request->run_index_); - if (itr == ongoing_async_requests_.end()) { - return Error( - RequestStatusCode::INVALID_ARG, "No matched asynchronous request found."); - } - - Error err = Error::Success; - std::unique_lock lock(mutex_); - cv_.wait(lock, [&err, &request, wait] { - if (!request->ready_) { - if (wait) { - return false; - } else { - err = Error(RequestStatusCode::UNAVAILABLE, "Request is not ready."); - } - } - return true; - }); - - if (!err.IsOk()) { - lock.unlock(); - return err; - } else { - ongoing_async_requests_.erase(itr->first); - } - lock.unlock(); - return Error::Success; -} - -//============================================================================== - -ProfileContext::ProfileContext(bool verbose) : verbose_(verbose) {} - -Error -ProfileContext::StartProfile() -{ - return SendCommand("start"); -} - -Error -ProfileContext::StopProfile() -{ - return SendCommand("stop"); -} - -//============================================================================== - -Error -ServerHealthHttpContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose) -{ - ctx->reset(static_cast( - new ServerHealthHttpContext(server_url, verbose))); - return Error::Success; -} - -ServerHealthHttpContext::ServerHealthHttpContext( - const std::string& server_url, bool verbose) - : ServerHealthContext(verbose), url_(server_url + "/" + kHealthRESTEndpoint) -{ -} - -Error -ServerHealthHttpContext::GetHealth(const std::string& url, bool* health) -{ - if (!curl_global.Status().IsOk()) { - return curl_global.Status(); - } - - CURL* curl = curl_easy_init(); - if (!curl) { - return Error( - RequestStatusCode::INTERNAL, "failed to initialize HTTP client"); - } - - curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - if (verbose_) { - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); - } - - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - curl_easy_cleanup(curl); - return Error( - RequestStatusCode::INTERNAL, - "HTTP client failed: " + std::string(curl_easy_strerror(res))); - } - - // Must use 64-bit integer with curl_easy_getinfo - int64_t http_code; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); - - curl_easy_cleanup(curl); - - *health = (http_code == 200) ? 
true : false; - - return Error::Success; -} - -Error -ServerHealthHttpContext::GetReady(bool* ready) -{ - return GetHealth(url_ + "/ready", ready); -} - -Error -ServerHealthHttpContext::GetLive(bool* live) -{ - return GetHealth(url_ + "/live", live); -} - -//============================================================================== - -Error -ServerStatusHttpContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose) -{ - ctx->reset(static_cast( - new ServerStatusHttpContext(server_url, verbose))); - return Error::Success; -} - -Error -ServerStatusHttpContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, bool verbose) -{ - ctx->reset(static_cast( - new ServerStatusHttpContext(server_url, model_name, verbose))); - return Error::Success; -} - -ServerStatusHttpContext::ServerStatusHttpContext( - const std::string& server_url, bool verbose) - : ServerStatusContext(verbose), url_(server_url + "/" + kStatusRESTEndpoint) -{ -} - -ServerStatusHttpContext::ServerStatusHttpContext( - const std::string& server_url, const std::string& model_name, bool verbose) - : ServerStatusContext(verbose), - url_(server_url + "/" + kStatusRESTEndpoint + "/" + model_name) -{ -} - -Error -ServerStatusHttpContext::GetServerStatus(ServerStatus* server_status) -{ - server_status->Clear(); - request_status_.Clear(); - response_.clear(); - - if (!curl_global.Status().IsOk()) { - return curl_global.Status(); - } - - CURL* curl = curl_easy_init(); - if (!curl) { - return Error( - RequestStatusCode::INTERNAL, "failed to initialize HTTP client"); - } - - // Want binary representation of the status. - std::string full_url = url_ + "?format=binary"; - curl_easy_setopt(curl, CURLOPT_URL, full_url.c_str()); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - if (verbose_) { - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); - } - - // response headers handled by ResponseHeaderHandler() - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, ResponseHeaderHandler); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, this); - - // response data handled by ResponseHandler() - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, this); - - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - curl_easy_cleanup(curl); - return Error( - RequestStatusCode::INTERNAL, - "HTTP client failed: " + std::string(curl_easy_strerror(res))); - } - - // Must use 64-bit integer with curl_easy_getinfo - int64_t http_code; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); - - curl_easy_cleanup(curl); - - // Should have a request status, if not then create an error status. - if (request_status_.code() == RequestStatusCode::INVALID) { - request_status_.Clear(); - request_status_.set_code(RequestStatusCode::INTERNAL); - request_status_.set_msg("status request did not return status"); - } - - // If request has failing HTTP status or the request's explicit - // status is not SUCCESS, then signal an error. - if ( - (http_code != 200) || - (request_status_.code() != RequestStatusCode::SUCCESS)) { - return Error(request_status_); - } - - // Parse the response as a ModelConfigList... 
- if (!server_status->ParseFromString(response_)) { - return Error(RequestStatusCode::INTERNAL, "failed to parse server status"); - } - - if (verbose_) { - std::cout << server_status->DebugString() << std::endl; - } - - return Error(request_status_); -} - -size_t -ServerStatusHttpContext::ResponseHeaderHandler( - void* contents, size_t size, size_t nmemb, void* userp) -{ - ServerStatusHttpContext* ctx = - reinterpret_cast(userp); - - char* buf = reinterpret_cast(contents); - size_t byte_size = size * nmemb; - - size_t idx = strlen(kStatusHTTPHeader); - if ((idx < byte_size) && !strncasecmp(buf, kStatusHTTPHeader, idx)) { - while ((idx < byte_size) && (buf[idx] != ':')) { - ++idx; - } - - if (idx < byte_size) { - std::string hdr(buf + idx + 1, byte_size - idx - 1); - - if (!google::protobuf::TextFormat::ParseFromString( - hdr, &ctx->request_status_)) { - ctx->request_status_.Clear(); - } - } - } - - return byte_size; -} - -size_t -ServerStatusHttpContext::ResponseHandler( - void* contents, size_t size, size_t nmemb, void* userp) -{ - ServerStatusHttpContext* ctx = - reinterpret_cast(userp); - uint8_t* buf = reinterpret_cast(contents); - size_t result_bytes = size * nmemb; - std::copy(buf, buf + result_bytes, std::back_inserter(ctx->response_)); - return result_bytes; -} - -//============================================================================== - -class HttpRequestImpl : public RequestImpl { - public: - HttpRequestImpl( - const uint64_t id, - const std::vector> inputs); - - ~HttpRequestImpl(); - - // Initialize the request for HTTP transfer on top of - // RequestImpl.InitializeRequestedResults() - Error InitializeRequest( - const std::vector>& requested_outputs, - const size_t batch_size); - - // Copy into 'buf' up to 'size' bytes of input data. Return the - // actual amount copied in 'input_bytes'. - Error GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes); - - // Copy into the context 'size' bytes of result data from - // 'buf'. Return the actual amount copied in 'result_>* results) override; - - private: - friend class InferHttpContext; - - // Pointer to easy handle that is processing the request - CURL* easy_handle_; - - // Pointer to the list of the HTTP request header, keep it such that it will - // be valid during the transfer and can be freed once transfer is completed. - struct curl_slist* header_list_; - - // Status code for the HTTP request. - CURLcode http_status_; - - // RequestStatus received in server response. - RequestStatus request_status_; - - // Buffer that accumulates the serialized InferResponseHeader at the - // end of the body. - std::string infer_response_buffer_; - - // The inputs for the request. For asynchronous request, it should - // be a deep copy of the inputs set by the user in case the user modifies - // them for another request during the HTTP transfer. - std::vector> inputs_; - - // Current positions within input vectors when sending request. 
- size_t input_pos_idx_; -}; - -HttpRequestImpl::HttpRequestImpl( - const uint64_t id, - const std::vector> inputs) - : RequestImpl(id), easy_handle_(curl_easy_init()), header_list_(NULL), - inputs_(inputs), input_pos_idx_(0) -{ - if (easy_handle_ != NULL) { - run_index_ = reinterpret_cast(easy_handle_); - } -} - -HttpRequestImpl::~HttpRequestImpl() -{ - if (easy_handle_ != NULL) { - curl_easy_cleanup(easy_handle_); - } -} - -Error -HttpRequestImpl::InitializeRequest( - const std::vector>& requested_outputs, - const size_t batch_size) -{ - infer_response_buffer_.clear(); - - // Reset all the position indicators so that we send all inputs - // correctly. - request_status_.Clear(); - - for (auto& io : inputs_) { - reinterpret_cast(io.get())->PrepareForRequest(); - } - - input_pos_idx_ = 0; - result_pos_idx_ = 0; - - return RequestImpl::InitializeRequestedResults(requested_outputs, batch_size); -} - - -Error -HttpRequestImpl::GetNextInput(uint8_t* buf, size_t size, size_t* input_bytes) -{ - *input_bytes = 0; - - while ((size > 0) && (input_pos_idx_ < inputs_.size())) { - InputImpl* io = reinterpret_cast(inputs_[input_pos_idx_].get()); - size_t ib = 0; - bool eoi = false; - Error err = io->GetNext(buf, size, &ib, &eoi); - if (!err.IsOk()) { - return err; - } - - // If input was completely read then move to the next. - if (eoi) { - input_pos_idx_++; - } - if (ib != 0) { - *input_bytes += ib; - size -= ib; - buf += ib; - } - } - - // Sent all input bytes - if (input_pos_idx_ >= inputs_.size()) { - timer_.Record(InferContext::RequestTimers::Kind::SEND_END); - } - - return Error::Success; -} - -Error -HttpRequestImpl::SetNextRawResult( - const uint8_t* buf, size_t size, size_t* result_bytes) -{ - *result_bytes = 0; - - while ((size > 0) && (result_pos_idx_ < requested_results_.size())) { - ResultImpl* io = - reinterpret_cast(requested_results_[result_pos_idx_].get()); - size_t ob = 0; - - // Only try to read raw result for RAW - if (io->ResultFormat() == InferContext::Result::ResultFormat::RAW) { - Error err = io->SetNextRawResult(buf, size, &ob); - if (!err.IsOk()) { - return err; - } - } - - // If output couldn't accept any more bytes then move to the next. - if (ob == 0) { - result_pos_idx_++; - } else { - *result_bytes += ob; - size -= ob; - buf += ob; - } - } - - // If there is any bytes left then they belong to the response - // header, since all the RAW results have been filled. - if (size > 0) { - infer_response_buffer_.append(reinterpret_cast(buf), size); - *result_bytes += size; - } - - return Error::Success; -} - -Error -HttpRequestImpl::GetResults( - std::vector>* results) -{ - InferResponseHeader infer_response; - - if (http_status_ != CURLE_OK) { - curl_slist_free_all(header_list_); - requested_results_.clear(); - return Error( - RequestStatusCode::INTERNAL, - "HTTP client failed: " + std::string(curl_easy_strerror(http_status_))); - } - - // Must use 64-bit integer with curl_easy_getinfo - int64_t http_code; - curl_easy_getinfo(easy_handle_, CURLINFO_RESPONSE_CODE, &http_code); - - curl_slist_free_all(header_list_); - - // Should have a request status, if not then create an error status. - if (request_status_.code() == RequestStatusCode::INVALID) { - request_status_.Clear(); - request_status_.set_code(RequestStatusCode::INTERNAL); - request_status_.set_msg("infer request did not return status"); - } - - // If request has failing HTTP status or the request's explicit - // status is not SUCCESS, then signal an error. 
- if ( - (http_code != 200) || - (request_status_.code() != RequestStatusCode::SUCCESS)) { - requested_results_.clear(); - return Error(request_status_); - } - - // The infer response header should be available... - if (infer_response_buffer_.empty()) { - requested_results_.clear(); - return Error( - RequestStatusCode::INTERNAL, - "infer request did not return result header"); - } - - infer_response.ParseFromString(infer_response_buffer_); - - PostRunProcessing(requested_results_, infer_response); - - results->swap(requested_results_); - - return Error(request_status_); -} - -//============================================================================== - -Error -InferHttpContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, int model_version, bool verbose) -{ - InferHttpContext* ctx_ptr = - new InferHttpContext(server_url, model_name, model_version, verbose); - - // Get status of the model and create the inputs and outputs. - std::unique_ptr sctx; - Error err = - ServerStatusHttpContext::Create(&sctx, server_url, model_name, verbose); - if (err.IsOk()) { - ServerStatus server_status; - err = sctx->GetServerStatus(&server_status); - if (err.IsOk()) { - const auto& itr = server_status.model_status().find(model_name); - if (itr == server_status.model_status().end()) { - err = Error( - RequestStatusCode::INTERNAL, - "unable to find status information for \"" + model_name + "\""); - } else { - const ModelConfig& model_info = itr->second.config(); - - ctx_ptr->max_batch_size_ = - static_cast(std::max(0, model_info.max_batch_size())); - - // Create inputs and outputs - for (const auto& io : model_info.input()) { - ctx_ptr->inputs_.emplace_back(std::make_shared(io)); - } - for (const auto& io : model_info.output()) { - ctx_ptr->outputs_.emplace_back(std::make_shared(io)); - } - } - } - } - - // Create request context for synchronous request. - ctx_ptr->sync_request_.reset( - static_cast(new HttpRequestImpl(0, ctx_ptr->inputs_))); - - if (err.IsOk()) { - ctx->reset(static_cast(ctx_ptr)); - } else { - ctx->reset(); - } - - return err; -} - -InferHttpContext::InferHttpContext( - const std::string& server_url, const std::string& model_name, - int model_version, bool verbose) - : InferContext(model_name, model_version, verbose), - multi_handle_(curl_multi_init()) -{ - // Process url for HTTP request - // URL doesn't contain the version portion if using the latest version. 
- url_ = server_url + "/" + kInferRESTEndpoint + "/" + model_name; - if (model_version_ >= 0) { - url_ += "/" + std::to_string(model_version_); - } -} - -InferHttpContext::~InferHttpContext() -{ - exiting_ = true; - // thread not joinable if AsyncRun() is not called - // (it is default constructed thread before the first AsyncRun() call) - if (worker_.joinable()) { - cv_.notify_all(); - worker_.join(); - } - - if (multi_handle_ != NULL) { - for (auto& request : ongoing_async_requests_) { - CURL* easy_handle = - std::static_pointer_cast(request.second)->easy_handle_; - // Just remove, easy_cleanup will be done in ~HttpRequestImpl() - curl_multi_remove_handle(multi_handle_, easy_handle); - } - curl_multi_cleanup(multi_handle_); - } -} - -Error -InferHttpContext::Run(std::vector>* results) -{ - std::shared_ptr sync_request = - std::static_pointer_cast(sync_request_); - - if (!curl_global.Status().IsOk()) { - return curl_global.Status(); - } - - Error err = PreRunProcessing(sync_request_); - - if (!err.IsOk()) { - return err; - } - - // Take run time - sync_request->timer_.Reset(); - sync_request->timer_.Record(RequestTimers::Kind::REQUEST_START); - sync_request->timer_.Record(RequestTimers::Kind::SEND_START); - sync_request->http_status_ = curl_easy_perform(sync_request->easy_handle_); - sync_request->timer_.Record(RequestTimers::Kind::RECEIVE_END); - sync_request->timer_.Record(RequestTimers::Kind::REQUEST_END); - - err = UpdateStat(sync_request->timer_); - if (!err.IsOk()) { - std::cerr << "Failed to update context stat: " << err << std::endl; - } - return sync_request->GetResults(results); -} - -Error -InferHttpContext::AsyncRun(std::shared_ptr* async_request) -{ - if (!multi_handle_) { - return Error( - RequestStatusCode::INTERNAL, "failed to start HTTP asynchronous client"); - } else if (exiting_) { - // abusing variable here, exiting_ is true either when destructor is called - // or the worker thread is not acutally created. 
- exiting_ = false; - worker_ = std::thread(&InferHttpContext::AsyncTransfer, this); - } - - // Make a copy of the current inputs - std::vector> inputs; - for (const auto& io : inputs_) { - InputImpl* input = reinterpret_cast(io.get()); - inputs.emplace_back(std::make_shared(*input)); - } - - HttpRequestImpl* current_context = - new HttpRequestImpl(async_request_id_++, inputs); - async_request->reset(static_cast(current_context)); - - if (!current_context->easy_handle_) { - return Error( - RequestStatusCode::INTERNAL, "failed to initialize HTTP client"); - } - - Error err = PreRunProcessing(*async_request); - - { - std::lock_guard lock(mutex_); - - auto insert_result = ongoing_async_requests_.emplace(std::make_pair( - reinterpret_cast(current_context->easy_handle_), - *async_request)); - - if (!insert_result.second) { - return Error( - RequestStatusCode::INTERNAL, - "Failed to insert new asynchronous request context."); - } - - curl_multi_add_handle(multi_handle_, current_context->easy_handle_); - current_context->timer_.Reset(); - current_context->timer_.Record(RequestTimers::Kind::REQUEST_START); - current_context->timer_.Record(RequestTimers::Kind::SEND_START); - } - - cv_.notify_all(); - return Error(RequestStatusCode::SUCCESS); -} - -Error -InferHttpContext::GetAsyncRunResults( - std::vector>* results, - const std::shared_ptr& async_request, bool wait) -{ - Error err = IsRequestReady(async_request, wait); - if (!err.IsOk()) { - return err; - } - std::shared_ptr http_request = - std::static_pointer_cast(async_request); - - { - std::lock_guard lock(mutex_); - curl_multi_remove_handle(multi_handle_, http_request->easy_handle_); - } - - err = UpdateStat(http_request->timer_); - if (!err.IsOk()) { - std::cerr << "Failed to update context stat: " << err << std::endl; - } - return http_request->GetResults(results); -} - -size_t -InferHttpContext::RequestProvider( - void* contents, size_t size, size_t nmemb, void* userp) -{ - HttpRequestImpl* request = reinterpret_cast(userp); - - size_t input_bytes = 0; - Error err = request->GetNextInput( - reinterpret_cast(contents), size * nmemb, &input_bytes); - if (!err.IsOk()) { - std::cerr << "RequestProvider: " << err << std::endl; - return CURL_READFUNC_ABORT; - } - - return input_bytes; -} - -size_t -InferHttpContext::ResponseHeaderHandler( - void* contents, size_t size, size_t nmemb, void* userp) -{ - HttpRequestImpl* request = reinterpret_cast(userp); - char* buf = reinterpret_cast(contents); - size_t byte_size = size * nmemb; - - size_t idx = strlen(kStatusHTTPHeader); - if ((idx < byte_size) && !strncasecmp(buf, kStatusHTTPHeader, idx)) { - while ((idx < byte_size) && (buf[idx] != ':')) { - ++idx; - } - - if (idx < byte_size) { - std::string hdr(buf + idx + 1, byte_size - idx - 1); - if (!google::protobuf::TextFormat::ParseFromString( - hdr, &request->request_status_)) { - request->request_status_.Clear(); - } - } - } - - return byte_size; -} - -size_t -InferHttpContext::ResponseHandler( - void* contents, size_t size, size_t nmemb, void* userp) -{ - HttpRequestImpl* request = reinterpret_cast(userp); - size_t result_bytes = 0; - - if (request->timer_.receive_start_.tv_sec == 0) { - request->timer_.Record(RequestTimers::Kind::RECEIVE_START); - } - - Error err = request->SetNextRawResult( - reinterpret_cast(contents), size * nmemb, &result_bytes); - if (!err.IsOk()) { - std::cerr << "ResponseHandler: " << err << std::endl; - return 0; - } - - return result_bytes; -} - -Error -InferHttpContext::PreRunProcessing(std::shared_ptr& request) -{ - 
std::shared_ptr http_request = - std::static_pointer_cast(request); - - http_request->InitializeRequest(requested_outputs_, batch_size_); - - CURL* curl = http_request->easy_handle_; - if (!curl) { - return Error( - RequestStatusCode::INTERNAL, "failed to initialize HTTP client"); - } - - std::string full_url = url_ + "?format=binary"; - curl_easy_setopt(curl, CURLOPT_URL, full_url.c_str()); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - curl_easy_setopt(curl, CURLOPT_POST, 1L); - curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1L); - if (verbose_) { - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); - } - - // request data provided by RequestProvider() - curl_easy_setopt(curl, CURLOPT_READFUNCTION, RequestProvider); - curl_easy_setopt(curl, CURLOPT_READDATA, http_request.get()); - - // response headers handled by ResponseHeaderHandler() - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, ResponseHeaderHandler); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, http_request.get()); - - // response data handled by ResponseHandler() - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, ResponseHandler); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, http_request.get()); - - // set the expected POST size. If you want to POST large amounts of - // data, consider CURLOPT_POSTFIELDSIZE_LARGE - curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, total_input_byte_size_); - - // Headers to specify input and output tensors - infer_request_str_.clear(); - infer_request_str_ = std::string(kInferRequestHTTPHeader) + ":" + - infer_request_.ShortDebugString(); - struct curl_slist* list = NULL; - list = curl_slist_append(list, "Expect:"); - list = curl_slist_append(list, "Content-Type: application/octet-stream"); - list = curl_slist_append(list, infer_request_str_.c_str()); - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, list); - - // The list should be freed after the request - http_request->header_list_ = list; - - return Error::Success; -} - -void -InferHttpContext::AsyncTransfer() -{ - int place_holder = 0; - CURLMsg* msg = NULL; - do { - bool has_completed = false; - // sleep if no work is available - std::unique_lock lock(mutex_); - cv_.wait(lock, [this] { - if (this->exiting_) { - return true; - } - // wake up if at least one request is not ready - for (auto& ongoing_async_request : this->ongoing_async_requests_) { - if ( - std::static_pointer_cast( - ongoing_async_request.second) - ->ready_ == false) { - return true; - } - } - return false; - }); - curl_multi_perform(multi_handle_, &place_holder); - while ((msg = curl_multi_info_read(multi_handle_, &place_holder))) { - // update request status - uintptr_t identifier = reinterpret_cast(msg->easy_handle); - auto itr = ongoing_async_requests_.find(identifier); - // This shouldn't happen - if (itr == ongoing_async_requests_.end()) { - fprintf( - stderr, - "Unexpected error: received completed request that" - " is not in the list of asynchronous requests.\n"); - curl_multi_remove_handle(multi_handle_, msg->easy_handle); - curl_easy_cleanup(msg->easy_handle); - continue; - } - std::shared_ptr http_request = - std::static_pointer_cast(itr->second); - - if (msg->msg != CURLMSG_DONE) { - // Something wrong happened. 
- fprintf(stderr, "Unexpected error: received CURLMsg=%d\n", msg->msg); - } else { - http_request->timer_.Record(RequestTimers::Kind::RECEIVE_END); - http_request->timer_.Record(RequestTimers::Kind::REQUEST_END); - } - http_request->http_status_ = msg->data.result; - http_request->ready_ = true; - has_completed = true; - } - lock.unlock(); - // if it has completed tasks, send signal in case the main thread is waiting - if (has_completed) { - cv_.notify_all(); - } - } while (!exiting_); -} - -//============================================================================== - -Error -ProfileHttpContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose) -{ - ctx->reset( - static_cast(new ProfileHttpContext(server_url, verbose))); - return Error::Success; -} - -ProfileHttpContext::ProfileHttpContext( - const std::string& server_url, bool verbose) - : ProfileContext(verbose), url_(server_url + "/" + kProfileRESTEndpoint) -{ -} - -Error -ProfileHttpContext::SendCommand(const std::string& cmd_str) -{ - request_status_.Clear(); - - if (!curl_global.Status().IsOk()) { - return curl_global.Status(); - } - - CURL* curl = curl_easy_init(); - if (!curl) { - return Error( - RequestStatusCode::INTERNAL, "failed to initialize HTTP client"); - } - - // Want binary representation of the status. - std::string full_url = url_ + "?cmd=" + cmd_str; - curl_easy_setopt(curl, CURLOPT_URL, full_url.c_str()); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0"); - if (verbose_) { - curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); - } - - // response headers handled by ResponseHeaderHandler() - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, ResponseHeaderHandler); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, this); - - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - curl_easy_cleanup(curl); - return Error( - RequestStatusCode::INTERNAL, - "HTTP client failed: " + std::string(curl_easy_strerror(res))); - } - - // Must use 64-bit integer with curl_easy_getinfo - int64_t http_code; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); - - curl_easy_cleanup(curl); - - // Should have a request status, if not then create an error status. 
- if (request_status_.code() == RequestStatusCode::INVALID) { - request_status_.Clear(); - request_status_.set_code(RequestStatusCode::INTERNAL); - request_status_.set_msg("profile request did not return status"); - } - - return Error(request_status_); -} - -size_t -ProfileHttpContext::ResponseHeaderHandler( - void* contents, size_t size, size_t nmemb, void* userp) -{ - ProfileHttpContext* ctx = reinterpret_cast(userp); - - char* buf = reinterpret_cast(contents); - size_t byte_size = size * nmemb; - - size_t idx = strlen(kStatusHTTPHeader); - if ((idx < byte_size) && !strncasecmp(buf, kStatusHTTPHeader, idx)) { - while ((idx < byte_size) && (buf[idx] != ':')) { - ++idx; - } - - if (idx < byte_size) { - std::string hdr(buf + idx + 1, byte_size - idx - 1); - - if (!google::protobuf::TextFormat::ParseFromString( - hdr, &ctx->request_status_)) { - ctx->request_status_.Clear(); - } - } - } - - return byte_size; -} - -//============================================================================== - -Error -ServerHealthGrpcContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose) -{ - ctx->reset(static_cast( - new ServerHealthGrpcContext(server_url, verbose))); - return Error::Success; -} - -ServerHealthGrpcContext::ServerHealthGrpcContext( - const std::string& server_url, bool verbose) - : ServerHealthContext(verbose), - stub_(GRPCService::NewStub(GetChannel(server_url))) -{ -} - -Error -ServerHealthGrpcContext::GetHealth(const std::string& mode, bool* health) -{ - Error err; - - HealthRequest request; - HealthResponse response; - grpc::ClientContext context; - - request.set_mode(mode); - grpc::Status grpc_status = stub_->Health(&context, request, &response); - if (grpc_status.ok()) { - *health = response.health(); - err = Error(response.request_status()); - } else { - // Something wrong with the gRPC conncection - err = Error( - RequestStatusCode::INTERNAL, - "gRPC client failed: " + std::to_string(grpc_status.error_code()) + ": " + - grpc_status.error_message()); - } - - if (verbose_ && err.IsOk()) { - std::cout << mode << ": " << *health << std::endl; - } - - return err; -} - -Error -ServerHealthGrpcContext::GetReady(bool* ready) -{ - return GetHealth("ready", ready); -} - -Error -ServerHealthGrpcContext::GetLive(bool* live) -{ - return GetHealth("live", live); -} - -//============================================================================== - -Error -ServerStatusGrpcContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose) -{ - ctx->reset(static_cast( - new ServerStatusGrpcContext(server_url, verbose))); - return Error::Success; -} - -Error -ServerStatusGrpcContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, bool verbose) -{ - ctx->reset(static_cast( - new ServerStatusGrpcContext(server_url, model_name, verbose))); - return Error::Success; -} - -ServerStatusGrpcContext::ServerStatusGrpcContext( - const std::string& server_url, bool verbose) - : ServerStatusContext(verbose), model_name_(""), - stub_(GRPCService::NewStub(GetChannel(server_url))) -{ -} - -ServerStatusGrpcContext::ServerStatusGrpcContext( - const std::string& server_url, const std::string& model_name, bool verbose) - : ServerStatusContext(verbose), model_name_(model_name), - stub_(GRPCService::NewStub(GetChannel(server_url))) -{ -} - -Error -ServerStatusGrpcContext::GetServerStatus(ServerStatus* server_status) -{ - server_status->Clear(); - - Error grpc_status; - - StatusRequest request; - StatusResponse 
response; - grpc::ClientContext context; - - request.set_model_name(model_name_); - grpc::Status status = stub_->Status(&context, request, &response); - if (status.ok()) { - server_status->Swap(response.mutable_server_status()); - grpc_status = Error(response.request_status()); - } else { - // Something wrong with the gRPC conncection - grpc_status = Error( - RequestStatusCode::INTERNAL, - "gRPC client failed: " + std::to_string(status.error_code()) + ": " + - status.error_message()); - } - - // Log server status if request is SUCCESS and verbose is true. - if (grpc_status.Code() == RequestStatusCode::SUCCESS && verbose_) { - std::cout << server_status->DebugString() << std::endl; - } - return grpc_status; -} - -//============================================================================== - -class GrpcRequestImpl : public RequestImpl { - public: - GrpcRequestImpl(const uint64_t id, const uintptr_t run_index); - - // @see RequestImpl.GetResults() - Error GetResults( - std::vector>* results) override; - - private: - // Unmarshall and process 'grpc_response_' into 'requested_results' - Error SetRawResult(); - - friend class InferGrpcContext; - - // Variables for gRPC call - grpc::ClientContext grpc_context_; - grpc::Status grpc_status_; - InferResponse grpc_response_; -}; - -GrpcRequestImpl::GrpcRequestImpl(const uint64_t id, const uintptr_t run_index) - : RequestImpl(id) -{ - run_index_ = run_index; -} - -Error -GrpcRequestImpl::SetRawResult() -{ - result_pos_idx_ = 0; - for (std::string output : grpc_response_.raw_output()) { - const uint8_t* buf = reinterpret_cast(&output[0]); - size_t size = output.size(); - size_t result_bytes = 0; - - // Not using loop as in HTTP Infer because the output size should match - if ((size > 0) && (result_pos_idx_ < requested_results_.size())) { - ResultImpl* io = reinterpret_cast( - requested_results_[result_pos_idx_].get()); - - // Only try to read raw result for RAW - if (io->ResultFormat() == InferContext::Result::ResultFormat::RAW) { - Error err = io->SetNextRawResult(buf, size, &result_bytes); - if (!err.IsOk()) { - return err; - } - } - } - - if (result_bytes != size) { - return Error( - RequestStatusCode::INVALID, - "Written bytes doesn't match received bytes."); - } - - result_pos_idx_++; - } - - return Error::Success; -} - -Error -GrpcRequestImpl::GetResults( - std::vector>* results) -{ - results->clear(); - InferResponseHeader infer_response; - - Error err(RequestStatusCode::SUCCESS); - if (grpc_status_.ok()) { - infer_response.Swap(grpc_response_.mutable_meta_data()); - err = Error(grpc_response_.request_status()); - if (err.IsOk()) { - Error set_err = SetRawResult(); - if (!set_err.IsOk()) { - return set_err; - } - } - } else { - // Something wrong with the gRPC conncection - err = Error( - RequestStatusCode::INTERNAL, - "gRPC client failed: " + std::to_string(grpc_status_.error_code()) + - ": " + grpc_status_.error_message()); - } - - // Only continue to process result if gRPC status is SUCCESS - if (err.Code() == RequestStatusCode::SUCCESS) { - PostRunProcessing(requested_results_, infer_response); - results->swap(requested_results_); - } - - return err; -} - -//============================================================================== - -Error -InferGrpcContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, int model_version, bool verbose) -{ - InferGrpcContext* ctx_ptr = - new InferGrpcContext(server_url, model_name, model_version, verbose); - - // Create request context for synchronous 
request. - ctx_ptr->sync_request_.reset( - static_cast(new GrpcRequestImpl(0, 0))); - - // Get status of the model and create the inputs and outputs. - std::unique_ptr sctx; - Error err = - ServerStatusGrpcContext::Create(&sctx, server_url, model_name, verbose); - if (err.IsOk()) { - ServerStatus server_status; - err = sctx->GetServerStatus(&server_status); - if (err.IsOk()) { - const auto& itr = server_status.model_status().find(model_name); - if (itr == server_status.model_status().end()) { - err = Error( - RequestStatusCode::INTERNAL, - "unable to find status information for \"" + model_name + "\""); - } else { - const ModelConfig& model_info = itr->second.config(); - - ctx_ptr->max_batch_size_ = - static_cast(std::max(0, model_info.max_batch_size())); - - // Create inputs and outputs - for (const auto& io : model_info.input()) { - ctx_ptr->inputs_.emplace_back(std::make_shared(io)); - } - for (const auto& io : model_info.output()) { - ctx_ptr->outputs_.emplace_back(std::make_shared(io)); - } - } - } - } - - if (err.IsOk()) { - ctx->reset(static_cast(ctx_ptr)); - } else { - ctx->reset(); - } - - return err; -} - -InferGrpcContext::InferGrpcContext( - const std::string& server_url, const std::string& model_name, - int model_version, bool verbose) - : InferContext(model_name, model_version, verbose), - stub_(GRPCService::NewStub(GetChannel(server_url))) -{ -} - -InferGrpcContext::~InferGrpcContext() -{ - exiting_ = true; - // thread not joinable if AsyncRun() is not called - // (it is default constructed thread before the first AsyncRun() call) - if (worker_.joinable()) { - cv_.notify_all(); - worker_.join(); - } - - // Close complete queue and drain its content - async_request_completion_queue_.Shutdown(); - bool has_next = true; - void* tag; - bool ok; - do { - has_next = async_request_completion_queue_.Next(&tag, &ok); - } while (has_next); -} - -Error -InferGrpcContext::Run(std::vector>* results) -{ - grpc::ClientContext context; - - std::shared_ptr sync_request = - std::static_pointer_cast(sync_request_); - sync_request->InitializeRequestedResults(requested_outputs_, batch_size_); - - sync_request->timer_.Reset(); - // Use send timer to measure time for marshalling infer request - sync_request->timer_.Record(RequestTimers::Kind::SEND_START); - PreRunProcessing(sync_request_); - sync_request->timer_.Record(RequestTimers::Kind::SEND_END); - - sync_request->timer_.Record(RequestTimers::Kind::REQUEST_START); - sync_request->grpc_status_ = - stub_->Infer(&context, request_, &sync_request->grpc_response_); - sync_request->timer_.Record(RequestTimers::Kind::REQUEST_END); - - sync_request->timer_.Record(RequestTimers::Kind::RECEIVE_START); - Error request_status = sync_request->GetResults(results); - sync_request->timer_.Record(RequestTimers::Kind::RECEIVE_END); - - Error err = UpdateStat(sync_request->timer_); - if (!err.IsOk()) { - std::cerr << "Failed to update context stat: " << err << std::endl; - } - - return request_status; -} - -Error -InferGrpcContext::AsyncRun(std::shared_ptr* async_request) -{ - if (exiting_) { - exiting_ = false; - worker_ = std::thread(&InferGrpcContext::AsyncTransfer, this); - } - uintptr_t run_index; - if (reusable_slot_.empty()) { - run_index = ongoing_async_requests_.size(); - } else { - run_index = reusable_slot_.back(); - reusable_slot_.pop_back(); - } - - GrpcRequestImpl* current_context = - new GrpcRequestImpl(async_request_id_++, run_index); - async_request->reset(static_cast(current_context)); - - auto insert_result = - 
ongoing_async_requests_.emplace(std::make_pair(run_index, *async_request)); - - if (!insert_result.second) { - return Error( - RequestStatusCode::INTERNAL, - "Failed to insert new asynchronous request context."); - } - - current_context->timer_.Reset(); - current_context->timer_.Record(RequestTimers::Kind::SEND_START); - PreRunProcessing(*async_request); - current_context->timer_.Record(RequestTimers::Kind::SEND_END); - - current_context->timer_.Record(RequestTimers::Kind::REQUEST_START); - std::unique_ptr> rpc( - stub_->PrepareAsyncInfer( - ¤t_context->grpc_context_, request_, - &async_request_completion_queue_)); - - rpc->StartCall(); - - rpc->Finish( - ¤t_context->grpc_response_, ¤t_context->grpc_status_, - (void*)run_index); - - cv_.notify_all(); - return Error(RequestStatusCode::SUCCESS); -} - -Error -InferGrpcContext::GetAsyncRunResults( - std::vector>* results, - const std::shared_ptr& async_request, bool wait) -{ - Error err = IsRequestReady(async_request, wait); - if (!err.IsOk()) { - return err; - } - - std::shared_ptr grpc_request = - std::static_pointer_cast(async_request); - - reusable_slot_.push_back(grpc_request->run_index_); - grpc_request->timer_.Record(RequestTimers::Kind::RECEIVE_START); - Error request_status = grpc_request->GetResults(results); - grpc_request->timer_.Record(RequestTimers::Kind::RECEIVE_END); - err = UpdateStat(grpc_request->timer_); - if (!err.IsOk()) { - std::cerr << "Failed to update context stat: " << err << std::endl; - } - return request_status; -} - -Error -InferGrpcContext::PreRunProcessing(std::shared_ptr& request) -{ - std::shared_ptr grpc_request = - std::static_pointer_cast(request); - grpc_request->InitializeRequestedResults(requested_outputs_, batch_size_); - - for (auto& io : inputs_) { - reinterpret_cast(io.get())->PrepareForRequest(); - } - - request_.Clear(); - request_.set_model_name(model_name_); - request_.set_version(model_version_); - request_.mutable_meta_data()->MergeFrom(infer_request_); - - size_t input_pos_idx = 0; - while (input_pos_idx < inputs_.size()) { - InputImpl* io = reinterpret_cast(inputs_[input_pos_idx].get()); - std::string* new_input = request_.add_raw_input(); - // Append all batches of one input together - for (size_t batch_idx = 0; batch_idx < batch_size_; batch_idx++) { - const uint8_t* data_ptr; - io->GetRaw(batch_idx, &data_ptr); - new_input->append( - reinterpret_cast(data_ptr), io->ByteSize()); - } - input_pos_idx++; - } - return Error::Success; -} - -void -InferGrpcContext::AsyncTransfer() -{ - do { - // sleep if no work is available - std::unique_lock lock(mutex_); - cv_.wait(lock, [this] { - if (this->exiting_) { - return true; - } - // wake up if at least one request is not ready - for (auto& ongoing_async_request : this->ongoing_async_requests_) { - if ( - std::static_pointer_cast( - ongoing_async_request.second) - ->ready_ == false) { - return true; - } - } - return false; - }); - lock.unlock(); - // gRPC async APIs are thread-safe https://github.com/grpc/grpc/issues/4486 - if (!exiting_) { - size_t got; - bool ok = true; - bool status = async_request_completion_queue_.Next((void**)(&got), &ok); - { - std::lock_guard lock(mutex_); - if (!ok) { - fprintf(stderr, "Unexpected not ok on client side."); - } - if (!status) { - fprintf(stderr, "Completion queue is closed."); - } - auto itr = ongoing_async_requests_.find(got); - if (itr == ongoing_async_requests_.end()) { - fprintf( - stderr, - "Unexpected error: received completed request that" - " is not in the list of asynchronous requests.\n"); - 
continue; - } - - std::shared_ptr grpc_request = - std::static_pointer_cast(itr->second); - grpc_request->timer_.Record(RequestTimers::Kind::REQUEST_END); - grpc_request->ready_ = true; - } - // send signal in case the main thread is waiting - cv_.notify_all(); - } - } while (!exiting_); -} - -//============================================================================== - -Error -ProfileGrpcContext::Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose) -{ - ctx->reset( - static_cast(new ProfileGrpcContext(server_url, verbose))); - return Error::Success; -} - -ProfileGrpcContext::ProfileGrpcContext( - const std::string& server_url, bool verbose) - : ProfileContext(verbose), - stub_(GRPCService::NewStub(GetChannel(server_url))) -{ -} - -Error -ProfileGrpcContext::SendCommand(const std::string& cmd_str) -{ - ProfileRequest request; - ProfileResponse response; - grpc::ClientContext context; - - request.set_cmd(cmd_str); - grpc::Status status = stub_->Profile(&context, request, &response); - if (status.ok()) { - return Error(response.request_status()); - } else { - // Something wrong with the gRPC conncection - return Error( - RequestStatusCode::INTERNAL, - "gRPC client failed: " + std::to_string(status.error_code()) + ": " + - status.error_message()); - } -} - -}}} // namespace nvidia::inferenceserver::client diff --git a/src/clients/c++/request.h b/src/clients/c++/request.h deleted file mode 100644 index c05a654dba..0000000000 --- a/src/clients/c++/request.h +++ /dev/null @@ -1,1077 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
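// A minimal sketch of the transport-error convention used in SendCommand()
// above: a failed grpc::Status is folded into a client Error with INTERNAL
// code (an OK transport instead wraps the server-reported RequestStatus).
// Assumes RequestStatusCode comes from request_status.pb.h in the
// nvidia::inferenceserver namespace.
#include <string>
#include "src/clients/c++/request.h"

namespace ni = nvidia::inferenceserver;
namespace nic = nvidia::inferenceserver::client;

nic::Error
GrpcStatusToError(const grpc::Status& status)
{
  if (status.ok()) {
    // In the real code path the server's RequestStatus would be wrapped here.
    return nic::Error::Success;
  }
  return nic::Error(
      ni::RequestStatusCode::INTERNAL,
      "gRPC client failed: " + std::to_string(status.error_code()) + ": " +
          status.error_message());
}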
-#pragma once - -/// \file - -#include -#include -#include -#include -#include -#include -#include -#include -#include "src/core/api.pb.h" -#include "src/core/grpc_service.grpc.pb.h" -#include "src/core/grpc_service.pb.h" -#include "src/core/model_config.h" -#include "src/core/model_config.pb.h" -#include "src/core/request_status.pb.h" -#include "src/core/server_status.pb.h" - -namespace nvidia { namespace inferenceserver { namespace client { - -//============================================================================== -/// Error status reported by client API. -/// -class Error { - public: - /// Create an error from a RequestStatus. - /// \param status The RequestStatus object - explicit Error(const RequestStatus& status); - - /// Create an error from a RequestStatusCode. - /// \param code The status code for the error - explicit Error(RequestStatusCode code = RequestStatusCode::SUCCESS); - - /// Create an error from a RequestStatusCode and a detailed message. - /// \param code The status code for the error - /// \param msg The detailed message for the error - explicit Error(RequestStatusCode code, const std::string& msg); - - /// Accessor for the RequestStatusCode of this error. - /// \return The RequestStatusCode for the error. - RequestStatusCode Code() const { return code_; } - - /// Accessor for the message of this error. - /// \return The detailed messsage for the error. Empty if no - /// detailed message. - const std::string& Message() const { return msg_; } - - /// Accessor for the ID of the inference server associated with this - /// error. - /// \return The ID of the inference server associated with this - /// error, or empty-string if no inference server is associated with - /// the error. - const std::string& ServerId() const { return server_id_; } - - /// Accessor for the ID of the request associated with this error. - /// \return The ID of the request associated with this error, or 0 - /// (zero) if no request ID is associated with the error. - uint64_t RequestId() const { return request_id_; } - - /// Does this error indicate OK status? - /// \return True if this error indicates "ok"/"success", false if - /// error indicates a failure. - bool IsOk() const { return code_ == RequestStatusCode::SUCCESS; } - - /// Convenience "success" value. Can be used as Error::Success to - /// indicate no error. - static const Error Success; - - private: - friend std::ostream& operator<<(std::ostream&, const Error&); - RequestStatusCode code_; - std::string msg_; - std::string server_id_; - uint64_t request_id_; -}; - -//============================================================================== -/// A ServerHealthContext object is used to query an inference server -/// for health information. Once created a ServerHealthContext object -/// can be used repeatedly to get health from the server. A -/// ServerHealthContext object can use either HTTP protocol or gRPC -/// protocol depending on the Create function -/// (ServerHealthHttpContext::Create or -/// ServerHealthGrpcContext::Create). For example: -/// -/// \code -/// std::unique_ptr ctx; -/// ServerHealthHttpContext::Create(&ctx, "localhost:8000"); -/// bool ready; -/// ctx->GetReady(&ready); -/// ... -/// bool live; -/// ctx->GetLive(&live); -/// ... -/// \endcode -/// -/// \note -/// ServerHealthContext::Create methods are thread-safe. -/// GetReady() and GetLive() are not thread-safe. For a given -/// ServerHealthContext, calls to GetReady() and GetLive() must be -/// serialized. 
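// A minimal sketch (assuming a build against this header) of the
// error-handling idiom the API expects: every call returns an Error, success
// is tested with IsOk(), and operator<< prints the error details.
#include <iostream>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

bool
CheckError(const nic::Error& err, const char* what)
{
  if (!err.IsOk()) {
    std::cerr << "error: " << what << ": " << err << std::endl;
    return false;
  }
  // Error::Success and any SUCCESS-coded Error land here.
  return true;
}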
-/// -class ServerHealthContext { - public: - /// Contact the inference server and get readiness state. - /// \param ready Returns the readiness state of the server. - /// \return Error object indicating success or failure of the request. - virtual Error GetReady(bool* ready) = 0; - - /// Contact the inference server and get liveness state. - /// \param ready Returns the liveness state of the server. - /// \return Error object indicating success or failure of the request. - virtual Error GetLive(bool* live) = 0; - - protected: - ServerHealthContext(bool); - - // If true print verbose output - const bool verbose_; -}; - -//============================================================================== -/// A ServerStatusContext object is used to query an inference server -/// for status information, including information about the models -/// available on that server. Once created a ServerStatusContext object -/// can be used repeatedly to get status from the server. -/// A ServerStatusContext object can use either HTTP protocol or gRPC protocol -/// depending on the Create function (ServerStatusHttpContext::Create or -/// ServerStatusGrpcContext::Create). For example: -/// -/// \code -/// std::unique_ptr ctx; -/// ServerStatusHttpContext::Create(&ctx, "localhost:8000"); -/// ServerStatus status; -/// ctx->GetServerStatus(&status); -/// ... -/// ctx->GetServerStatus(&status); -/// ... -/// \endcode -/// -/// \note -/// ServerStatusContext::Create methods are thread-safe. -/// GetServerStatus() is not thread-safe. For a given -/// ServerStatusContext, calls to GetServerStatus() must be -/// serialized. -/// -class ServerStatusContext { - public: - /// Contact the inference server and get status. - /// \param status Returns the status. - /// \return Error object indicating success or failure of the request. - virtual Error GetServerStatus(ServerStatus* status) = 0; - - protected: - ServerStatusContext(bool); - - // If true print verbose output - const bool verbose_; -}; - -//============================================================================== -/// An InferContext object is used to run inference on an inference -/// server for a specific model. Once created an InferContext object -/// can be used repeatedly to perform inference using the -/// model. Options that control how inference is performed can be -/// changed in between inference runs. -/// -/// A InferContext object can use either HTTP protocol or gRPC protocol -/// depending on the Create function (InferHttpContext::Create or -/// InferGrpcContext::Create). For example: -/// -/// \code -/// std::unique_ptr ctx; -/// InferHttpContext::Create(&ctx, "localhost:8000", "mnist"); -/// ... -/// std::unique_ptr options0; -/// Options::Create(&options0); -/// options->SetBatchSize(b); -/// options->AddClassResult(output, topk); -/// ctx->SetRunOptions(*options0); -/// ... -/// ctx->Run(&results0); // run using options0 -/// ctx->Run(&results1); // run using options0 -/// ... -/// std::unique_ptr options1; -/// Options::Create(&options1); -/// options->AddRawResult(output); -/// ctx->SetRunOptions(*options); -/// ... -/// ctx->Run(&results2); // run using options1 -/// ctx->Run(&results3); // run using options1 -/// ... -/// \endcode -/// -/// \note -/// InferContext::Create methods are thread-safe. -/// All other InferContext methods, and nested class methods are not -/// thread-safe. -/// \par -/// The Run() calls are not thread-safe but a new Run() can -/// be invoked as soon as the previous completes. 
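// A minimal sketch of querying model status over gRPC, mirroring how
// InferGrpcContext::Create above looks up the model configuration. Server
// URL and model name are placeholders, and template arguments are restored
// where this rendering of the diff stripped the angle brackets.
#include <iostream>
#include <memory>
#include <string>
#include "src/clients/c++/request.h"

namespace ni = nvidia::inferenceserver;
namespace nic = nvidia::inferenceserver::client;

nic::Error
PrintMaxBatchSize(const std::string& server_url, const std::string& model_name)
{
  std::unique_ptr<nic::ServerStatusContext> sctx;
  nic::Error err = nic::ServerStatusGrpcContext::Create(
      &sctx, server_url, model_name, false /* verbose */);
  if (!err.IsOk()) {
    return err;
  }

  ni::ServerStatus server_status;
  err = sctx->GetServerStatus(&server_status);
  if (!err.IsOk()) {
    return err;
  }

  const auto itr = server_status.model_status().find(model_name);
  if (itr == server_status.model_status().end()) {
    return nic::Error(
        ni::RequestStatusCode::INTERNAL,
        "unable to find status information for \"" + model_name + "\"");
  }

  // The model's ModelConfig carries max_batch_size plus its inputs/outputs.
  std::cout << model_name << " max batch size: "
            << itr->second.config().max_batch_size() << std::endl;
  return nic::Error::Success;
}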
The returned result -/// objects are owned by the caller and may be retained and accessed -/// even after the InferContext object is destroyed. -/// \par -/// AsyncRun() and GetAsyncRunStatus() calls are not thread-safe. -/// What's more, calling one method while the other one is running -/// will result in undefined behavior given that they will modify the -/// shared data internally. -/// \par -/// For more parallelism multiple InferContext objects can access the -/// same inference server with no serialization requirements across -/// those objects. -/// \endcode -/// -class InferContext { - public: - //============== - /// An input to the model. - class Input { - public: - /// Destroy the input. - virtual ~Input(){}; - - /// \return The name of the input. - virtual const std::string& Name() const = 0; - - /// \return The size in bytes of this input. This is the size for - /// one instance of the input, not the entire size of a batched - /// input. - virtual size_t ByteSize() const = 0; - - /// \return The data-type of the input. - virtual DataType DType() const = 0; - - /// \return The format of the input. - virtual ModelInput::Format Format() const = 0; - - /// \return The dimensions/shape of the input. - virtual const DimsList& Dims() const = 0; - - /// Prepare this input to receive new tensor values. Forget any - /// existing values that were set by previous calls to - /// SetRaw(). - /// \return Error object indicating success or failure. - virtual Error Reset() = 0; - - /// Set tensor values for this input from a byte array. The array - /// is not copied and so it must not be modified or destroyed - /// until this input is no longer needed (that is until the Run() - /// call(s) that use the input have completed). For batched inputs - /// this function must be called batch-size times to provide all - /// tensor values for a batch of this input. - /// \param input The pointer to the array holding the tensor value. - /// \param input_byte_size The size of the array in bytes, must match - /// the size expected by the input. - /// \return Error object indicating success or failure. - virtual Error SetRaw(const uint8_t* input, size_t input_byte_size) = 0; - - /// Set tensor values for this input from a byte vector. The vector - /// is not copied and so it must not be modified or destroyed - /// until this input is no longer needed (that is until the Run() - /// call(s) that use the input have completed). For batched inputs - /// this function must be called batch-size times to provide all - /// tensor values for a batch of this input. - /// \param input The vector holding tensor values. - /// \return Error object indicating success or failure. - virtual Error SetRaw(const std::vector& input) = 0; - }; - - //============== - /// An output from the model. - class Output { - public: - /// Destroy the output. - virtual ~Output(){}; - - /// \return The name of the output. - virtual const std::string& Name() const = 0; - - /// \return The size in bytes of this output. This is the size for - /// one instance of the output, not the entire size of a batched - /// input. - virtual size_t ByteSize() const = 0; - - /// \return The data-type of the output. - virtual DataType DType() const = 0; - - /// \return The dimensions/shape of the output. - virtual const DimsList& Dims() const = 0; - }; - - //============== - /// An inference result corresponding to an output. - class Result { - public: - /// Destroy the result. - virtual ~Result(){}; - - /// Format in which result is returned. 
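// A minimal sketch of feeding a batched input: SetRaw() is called once per
// batch entry and the buffers are not copied, so they must stay alive until
// the Run()/AsyncRun() using them completes. The input name is a placeholder.
#include <cstdint>
#include <memory>
#include <vector>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

nic::Error
SetBatchedInput(
    nic::InferContext* ctx, const std::vector<std::vector<uint8_t>>& batches)
{
  std::shared_ptr<nic::InferContext::Input> input;
  nic::Error err = ctx->GetInput("INPUT0", &input);
  if (!err.IsOk()) {
    return err;
  }

  // Forget any values supplied for a previous run.
  err = input->Reset();
  if (!err.IsOk()) {
    return err;
  }

  // One SetRaw() call per batch entry, in batch order.
  for (const auto& entry : batches) {
    err = input->SetRaw(entry);
    if (!err.IsOk()) {
      return err;
    }
  }
  return nic::Error::Success;
}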
- enum ResultFormat { - /// RAW format is the entire result tensor of values. - RAW = 0, - - /// CLASS format is the top-k highest probability values of the - /// result and the associated class label (if provided by the - /// model). - CLASS = 1 - }; - - /// \return The name of the model that produced this result. - virtual const std::string& ModelName() const = 0; - - /// \return The version of the model that produced this result. - virtual uint32_t ModelVersion() const = 0; - - /// \return The Output object corresponding to this result. - virtual const std::shared_ptr GetOutput() const = 0; - - /// Get a reference to entire raw result data for a specific batch - /// entry. Returns error if this result is not RAW format. - /// \param batch_idx Returns the results for this entry of the batch. - /// \param buf Returns the vector of result bytes. - /// \return Error object indicating success or failure. - virtual Error GetRaw( - size_t batch_idx, const std::vector** buf) const = 0; - - /// Get a reference to raw result data for a specific batch entry - /// at the current "cursor" and advance the cursor by the specified - /// number of bytes. More typically use GetRawAtCursor() method - /// to return the data as a specific type T. Use ResetCursor() to - /// reset the cursor to the beginning of the result. Returns error - /// if this result is not RAW format. - /// \param batch_idx Returns results for this entry of the batch. - /// \param buf Returns pointer to 'adv_byte_size' bytes of data. - /// \param adv_byte_size The number of bytes of data to get a reference to. - /// \return Error object indicating success or failure. - virtual Error GetRawAtCursor( - size_t batch_idx, const uint8_t** buf, size_t adv_byte_size) = 0; - - /// Read a value for a specific batch entry at the current "cursor" - /// from the result tensor as the specified type T and advance the - /// cursor. Use ResetCursor() to reset the cursor to the beginning - /// of the result. Returns error if this result is not RAW format. - /// \param batch_idx Returns results for this entry of the batch. - /// \param out Returns the value at the cursor. - /// \return Error object indicating success or failure. - template - Error GetRawAtCursor(size_t batch_idx, T* out); - - /// The result value for CLASS format results. - struct ClassResult { - /// The index of the class in the result vector. - size_t idx; - /// The value of the class. - float value; - /// The label for the class, if provided by the model. - std::string label; - }; - - /// Get the number of class results for a batch. Returns error if - /// this result is not CLASS format. - /// \param batch_idx The index in the batch. - /// \param cnt Returns the number of ClassResult entries for the - /// batch entry. - /// \return Error object indicating success or failure. - virtual Error GetClassCount(size_t batch_idx, size_t* cnt) const = 0; - - /// Get the ClassResult result for a specific batch entry at the - /// current cursor. Use ResetCursor() to reset the cursor to the - /// beginning of the result. Returns error if this result is not - /// CLASS format. - /// \param batch_idx The index in the batch. - /// \param result Returns the ClassResult value for the batch at the cursor. - /// \return Error object indicating success or failure. - virtual Error GetClassAtCursor(size_t batch_idx, ClassResult* result) = 0; - - /// Reset cursor to beginning of result for all batch entries. - /// \return Error object indicating success or failure. 
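// A minimal sketch of walking CLASS-format results: GetClassCount() gives
// the number of top-k entries for a batch item and GetClassAtCursor()
// advances through them. Assumes the output was requested via
// AddClassResult().
#include <cstddef>
#include <iostream>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

nic::Error
PrintClassResults(nic::InferContext::Result* result, size_t batch_idx)
{
  size_t cnt = 0;
  nic::Error err = result->GetClassCount(batch_idx, &cnt);
  if (!err.IsOk()) {
    return err;
  }

  for (size_t i = 0; i < cnt; ++i) {
    nic::InferContext::Result::ClassResult cls;
    err = result->GetClassAtCursor(batch_idx, &cls);
    if (!err.IsOk()) {
      return err;
    }
    std::cout << cls.idx << " (" << cls.label << ") = " << cls.value
              << std::endl;
  }
  return nic::Error::Success;
}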
- virtual Error ResetCursors() = 0; - - /// Reset cursor to beginning of result for specified batch entry. - /// \param batch_idx The index in the batch. - /// \return Error object indicating success or failure. - virtual Error ResetCursor(size_t batch_idx) = 0; - }; - - //============== - /// Run options to be applied to all subsequent Run() invocations. - class Options { - public: - virtual ~Options(){}; - - /// Create a new Options object with default values. - /// \return Error object indicating success or failure. - static Error Create(std::unique_ptr* options); - - /// \return The batch size to use for all subsequent inferences. - virtual size_t BatchSize() const = 0; - - /// Set the batch size to use for all subsequent inferences. - /// \param batch_size The batch size. - virtual void SetBatchSize(size_t batch_size) = 0; - - /// Add 'output' to the list of requested RAW results. Run() will - /// return the output's full tensor as a result. - /// \param output The output. - /// \return Error object indicating success or failure. - virtual Error AddRawResult( - const std::shared_ptr& output) = 0; - - /// Add 'output' to the list of requested CLASS results. Run() will - /// return the highest 'k' values of 'output' as a result. - /// \param output The output. - /// \param k Set how many class results to return for the output. - /// \return Error object indicating success or failure. - virtual Error AddClassResult( - const std::shared_ptr& output, uint64_t k) = 0; - }; - - //============== - /// Handle to a inference request. The request handle is used to get - /// request results if the request is sent by AsyncRun(). - class Request { - public: - /// Destroy the request handle. - virtual ~Request() = default; - - /// \return The unique identifier of the request. - virtual uint64_t Id() const = 0; - }; - - //============== - /// Cumulative statistic of the InferContext. - /// - /// \note - /// For gRPC protocol, 'cumulative_send_time_ns' represents the - /// time for marshaling infer request. - /// 'cumulative_receive_time_ns' represents the time for - /// unmarshaling infer response. - struct Stat { - /// Total number of requests completed. - size_t completed_request_count; - - /// Time from the request start until the response is completely - /// received. - uint64_t cumulative_total_request_time_ns; - - /// Time from the request start until the last byte is sent. - uint64_t cumulative_send_time_ns; - - /// Time from receiving first byte of the response until the - /// response is completely received. - uint64_t cumulative_receive_time_ns; - - /// Create a new Stat object with zero-ed statistics. - Stat() - : completed_request_count(0), cumulative_total_request_time_ns(0), - cumulative_send_time_ns(0), cumulative_receive_time_ns(0) - { - } - }; - - //============== - /// Timer to record the timestamp for different stages of request - /// handling. - class RequestTimers { - public: - /// The kind of the timer. - enum Kind { - /// The start of request handling. - REQUEST_START, - /// The end of request handling. - REQUEST_END, - /// The start of sending request bytes to the server (i.e. first byte). - SEND_START, - /// The end of sending request bytes to the server (i.e. last byte). - SEND_END, - /// The start of receiving response bytes from the server - /// (i.e. first byte). - RECEIVE_START, - /// The end of receiving response bytes from the server - /// (i.e. last byte). - RECEIVE_END - }; - - /// Construct a timer with zero-ed timestamps. 
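// A minimal sketch of configuring run options: one RAW output and one CLASS
// output with a top-k of 3, applied to all subsequent Run()/AsyncRun()
// calls. The output names and batch size are placeholders.
#include <memory>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

nic::Error
ConfigureOptions(nic::InferContext* ctx, size_t batch_size)
{
  std::unique_ptr<nic::InferContext::Options> options;
  nic::Error err = nic::InferContext::Options::Create(&options);
  if (!err.IsOk()) {
    return err;
  }

  options->SetBatchSize(batch_size);

  std::shared_ptr<nic::InferContext::Output> raw_out, class_out;
  if ((err = ctx->GetOutput("OUTPUT0", &raw_out)).IsOk() &&
      (err = ctx->GetOutput("OUTPUT1", &class_out)).IsOk()) {
    options->AddRawResult(raw_out);
    options->AddClassResult(class_out, 3 /* top-k */);
    err = ctx->SetRunOptions(*options);
  }
  return err;
}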
- RequestTimers(); - - /// Reset all timestamp values to zero. Must be called before - /// re-using the timer. - /// \return Error object indicating success or failure. - Error Reset(); - - /// Record the current timestamp for a request stage. - /// \param kind The Kind of the timestamp. - /// \return Error object indicating success or failure. - Error Record(Kind kind); - - private: - friend class InferContext; - friend class InferHttpContext; - friend class InferGrpcContext; - struct timespec request_start_; - struct timespec request_end_; - struct timespec send_start_; - struct timespec send_end_; - struct timespec receive_start_; - struct timespec receive_end_; - }; - - public: - /// Destroy the inference context. - virtual ~InferContext() = default; - - /// \return The name of the model being used for this context. - const std::string& ModelName() const { return model_name_; } - - /// \return The version of the model being used for this context. -1 - /// indicates that the latest (i.e. highest version number) version - /// of that model is being used. - int ModelVersion() const { return model_version_; } - - /// \return The maximum batch size supported by the context. A - /// maximum batch size indicates that the context does not support - /// batching and so only a single inference at a time can be - /// performed. - uint64_t MaxBatchSize() const { return max_batch_size_; } - - /// \return The inputs of the model. - const std::vector>& Inputs() const { return inputs_; } - - /// \return The outputs of the model. - const std::vector>& Outputs() const - { - return outputs_; - } - - /// Get a named input. - /// \param name The name of the input. - /// \param input Returns the Input object for 'name'. - /// \return Error object indicating success or failure. - Error GetInput(const std::string& name, std::shared_ptr* input) const; - - /// Get a named output. - /// \param name The name of the output. - /// \param output Returns the Output object for 'name'. - /// \return Error object indicating success or failure. - Error GetOutput( - const std::string& name, std::shared_ptr* output) const; - - /// Set the options to use for all subsequent Run() invocations. - /// \param options The options. - /// \return Error object indicating success or failure. - Error SetRunOptions(const Options& options); - - /// Get the current statistics of the InferContext. - /// \param stat Returns the Stat object holding the statistics. - /// \return Error object indicating success or failure. - Error GetStat(Stat* stat); - - /// Send a synchronous request to the inference server to perform an - /// inference to produce results for the outputs specified in the - /// most recent call to SetRunOptions(). The Result objects holding - /// the output values are returned in the same order as the outputs - /// are specified in the options. - /// \param results Returns Result objects holding inference results. - /// \return Error object indicating success or failure. - virtual Error Run(std::vector>* results) = 0; - - /// Send an asynchronous request to the inference server to perform - /// an inference to produce results for the outputs specified in the - /// most recent call to SetRunOptions(). - /// \param async_request Returns a Request object that can be used - /// to retrieve the inference results for the request. - /// \return Error object indicating success or failure. - virtual Error AsyncRun(std::shared_ptr* async_request) = 0; - - /// Get the results of the asynchronous request referenced by 'async_request'. 
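// A minimal sketch of reading the cumulative context statistics after some
// number of completed requests and deriving an average end-to-end latency.
// The Stat fields are in nanoseconds, as documented above.
#include <cstdint>
#include <iostream>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

void
ReportAverageLatency(nic::InferContext* ctx)
{
  nic::InferContext::Stat stat;
  if (ctx->GetStat(&stat).IsOk() && (stat.completed_request_count > 0)) {
    const uint64_t avg_ns =
        stat.cumulative_total_request_time_ns / stat.completed_request_count;
    std::cout << "completed: " << stat.completed_request_count
              << ", avg request time: " << (avg_ns / 1000) << " usec"
              << std::endl;
  }
}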
- /// The Result objects holding the output values are returned in the same - /// order as the outputs are specified in the options when AsyncRun() was - /// called. \param results Return Result objects holding inference results. - /// \param async_request Request handle to retrieve results. - /// \param wait If true, block until the request completes. Otherwise, return - /// immediately. - /// \return Error object indicating success or failure. Success will be - /// returned only if the request has been completed succesfully. UNAVAILABLE - /// will be returned if 'wait' is false and the request is not ready. - virtual Error GetAsyncRunResults( - std::vector>* results, - const std::shared_ptr& async_request, bool wait) = 0; - - /// Get any one completed asynchronous request. - /// \param async_request Returns the Request object holding the - /// completed request. - /// \param wait If true, block until the request completes. Otherwise, return - /// immediately. - /// \return Error object indicating success or failure. Success will be - /// returned only if a completed request was returned.. UNAVAILABLE - /// will be returned if 'wait' is false and no request is ready. - Error GetReadyAsyncRequest( - std::shared_ptr* async_request, bool wait); - - protected: - InferContext(const std::string&, int, bool); - - // Function for worker thread to proceed the data transfer for all requests - virtual void AsyncTransfer() = 0; - - // Helper function called before inference to prepare 'request' - virtual Error PreRunProcessing(std::shared_ptr& request) = 0; - - // Helper function called by GetAsyncRunResults() to check if the request - // is ready. If the request is valid and wait == true, - // the function will block until request is ready. - Error IsRequestReady( - const std::shared_ptr& async_request, bool wait); - - // Update the context stat with the given timer - Error UpdateStat(const RequestTimers& timer); - - using AsyncReqMap = std::map>; - - // map to record ongoing asynchronous requests with pointer to easy handle - // as key - AsyncReqMap ongoing_async_requests_; - - // Model name - const std::string model_name_; - - // Model version - const int model_version_; - - // If true print verbose output - const bool verbose_; - - // Maximum batch size supported by this context. A maximum batch - // size indicates that the context does not support batching and so - // only a single inference at a time can be performed. - uint64_t max_batch_size_; - - // Total size of all inputs, in bytes (must be 64-bit integer - // because used with curl_easy_setopt). 
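// A minimal sketch of the asynchronous flow described above: issue several
// AsyncRun() requests, then collect whichever completes first with
// GetReadyAsyncRequest() and fetch its results. With wait=false an
// UNAVAILABLE error simply means "not ready yet". Template arguments are
// restored where this rendering stripped them.
#include <memory>
#include <vector>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

nic::Error
RunAsyncBatch(nic::InferContext* ctx, size_t request_count)
{
  std::vector<std::shared_ptr<nic::InferContext::Request>> pending;
  for (size_t i = 0; i < request_count; ++i) {
    std::shared_ptr<nic::InferContext::Request> req;
    nic::Error err = ctx->AsyncRun(&req);
    if (!err.IsOk()) {
      return err;
    }
    pending.push_back(req);
  }

  for (size_t i = 0; i < pending.size(); ++i) {
    std::shared_ptr<nic::InferContext::Request> ready;
    nic::Error err = ctx->GetReadyAsyncRequest(&ready, true /* wait */);
    if (!err.IsOk()) {
      return err;
    }
    std::vector<std::unique_ptr<nic::InferContext::Result>> results;
    err = ctx->GetAsyncRunResults(&results, ready, true /* wait */);
    if (!err.IsOk()) {
      return err;
    }
    // 'results' follow the output order given in SetRunOptions().
  }
  return nic::Error::Success;
}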
- uint64_t total_input_byte_size_; - - // Requested batch size for inference request - uint64_t batch_size_; - - // Use to assign unique identifier for each asynchronous request - uint64_t async_request_id_; - - // The inputs and outputs - std::vector> inputs_; - std::vector> outputs_; - - // Settings generated by current option - // InferRequestHeader protobuf describing the request - InferRequestHeader infer_request_; - - // Outputs requested for inference request - std::vector> requested_outputs_; - - // Standalone request context used for synchronous request - std::shared_ptr sync_request_; - - // The statistic of the current context - Stat context_stat_; - - // worker thread that will perform the asynchronous transfer - std::thread worker_; - - // Avoid race condition between main thread and worker thread - std::mutex mutex_; - - // Condition variable used for waiting on asynchronous request - std::condition_variable cv_; - - // signal for worker thread to stop - bool exiting_; -}; - -//============================================================================== -/// A ProfileContext object is used to control profiling on the -/// inference server. Once created a ProfileContext object can be used -/// repeatedly. -/// -/// A ProfileContext object can use either HTTP protocol or gRPC protocol -/// depending on the Create function (ProfileHttpContext::Create or -/// ProfileGrpcContext::Create). For example: -/// -/// \code -/// std::unique_ptr ctx; -/// ProfileGrpcContext::Create(&ctx, "localhost:8000"); -/// ctx->StartProfile(); -/// ... -/// ctx->StopProfile(); -/// ... -/// \endcode -/// -/// \note -/// ProfileContext::Create methods are thread-safe. StartProfiling() -/// and StopProfiling() are not thread-safe. For a given -/// ProfileContext, calls to these methods must be serialized. -/// -class ProfileContext { - public: - /// Start profiling on the inference server. - /// \return Error object indicating success or failure. - Error StartProfile(); - - /// Stop profiling on the inference server. - // \return Error object indicating success or failure. - Error StopProfile(); - - protected: - ProfileContext(bool); - virtual Error SendCommand(const std::string& cmd_str) = 0; - - // If true print verbose output - const bool verbose_; -}; - -//============================================================================== -/// ServerHealthHttpContext is the HTTP instantiation of -/// ServerHealthContext. -/// -class ServerHealthHttpContext : public ServerHealthContext { - public: - /// Create a context that returns health information. - /// \param ctx Returns a new ServerHealthHttpContext object. - /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose = false); - - Error GetReady(bool* ready) override; - Error GetLive(bool* live) override; - - private: - ServerHealthHttpContext(const std::string&, bool); - Error GetHealth(const std::string& url, bool* health); - - // URL for health endpoint on inference server. - const std::string url_; -}; - -//============================================================================== -/// ServerStatusHttpContext is the HTTP instantiation of -/// ServerStatusContext. 
-/// -class ServerStatusHttpContext : public ServerStatusContext { - public: - /// Create a context that returns information about an inference - /// server and all models on the server using HTTP protocol. - /// \param ctx Returns a new ServerStatusHttpContext object. - /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose = false); - - /// Create a context that returns information about an inference - /// server and one model on the sever using HTTP protocol. - /// \param ctx Returns a new ServerStatusHttpContext object. - /// \param server_url The inference server name and port. - /// \param model_name The name of the model to get status for. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, bool verbose = false); - - /// Contact the inference server and get status. - /// \param status Returns the status. - /// \return Error object indicating success or failure. - Error GetServerStatus(ServerStatus* status) override; - - private: - static size_t ResponseHeaderHandler(void*, size_t, size_t, void*); - static size_t ResponseHandler(void*, size_t, size_t, void*); - - ServerStatusHttpContext(const std::string&, bool); - ServerStatusHttpContext(const std::string&, const std::string&, bool); - - // URL for status endpoint on inference server. - const std::string url_; - - // RequestStatus received in server response - RequestStatus request_status_; - - // Serialized ServerStatus response from server. - std::string response_; -}; - -//============================================================================== -/// InferHttpContext is the HTTP instantiation of InferContext. -/// -class InferHttpContext : public InferContext { - public: - ~InferHttpContext() override; - - /// Create context that performs inference for a model using HTTP protocol. - /// \param ctx Returns a new InferHttpContext object. - /// \param server_url The inference server name and port. - /// \param model_name The name of the model to get status for. - /// \param model_version The version of the model to use for inference, - /// or -1 to indicate that the latest (i.e. highest version number) - /// version should be used. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. 
- static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, int model_version = -1, - bool verbose = false); - - Error Run(std::vector>* results) override; - Error AsyncRun(std::shared_ptr* async_request) override; - Error GetAsyncRunResults( - std::vector>* results, - const std::shared_ptr& async_request, bool wait) override; - - private: - static size_t RequestProvider(void*, size_t, size_t, void*); - static size_t ResponseHeaderHandler(void*, size_t, size_t, void*); - static size_t ResponseHandler(void*, size_t, size_t, void*); - - InferHttpContext(const std::string&, const std::string&, int, bool); - - // @see InferContext.AsyncTransfer() - void AsyncTransfer() override; - - // @see InferContext.PreRunProcessing() - Error PreRunProcessing(std::shared_ptr& request) override; - - // curl multi handle for processing asynchronous requests - CURLM* multi_handle_; - - // URL to POST to - std::string url_; - - // Serialized InferRequestHeader - std::string infer_request_str_; - - // Keep an easy handle alive to reuse the connection - CURL* curl_; -}; - -//============================================================================== -/// ProfileHttpContext is the HTTP instantiation of ProfileContext. -/// -class ProfileHttpContext : public ProfileContext { - public: - /// Create context that controls profiling on a server using HTTP - /// protocol. - /// \param ctx Returns the new ProfileContext object. - /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose = false); - - private: - static size_t ResponseHeaderHandler(void*, size_t, size_t, void*); - - ProfileHttpContext(const std::string&, bool); - Error SendCommand(const std::string& cmd_str) override; - - // URL for status endpoint on inference server. - const std::string url_; - - // RequestStatus received in server response - RequestStatus request_status_; -}; - -//============================================================================== -/// ServerHealthGrpcContext is the gRPC instantiation of -/// ServerHealthContext. -/// -class ServerHealthGrpcContext : public ServerHealthContext { - public: - /// Create a context that returns health information about server. - /// \param ctx Returns a new ServerHealthGrpcContext object. - /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose = false); - - Error GetReady(bool* ready) override; - Error GetLive(bool* live) override; - - private: - ServerHealthGrpcContext(const std::string&, bool); - Error GetHealth(const std::string& mode, bool* health); - - // gRPC end point. - std::unique_ptr stub_; -}; - -//============================================================================== -/// ServerStatusGrpcContext is the gRPC instantiation of -/// ServerStatusContext. -/// -class ServerStatusGrpcContext : public ServerStatusContext { - public: - /// Create a context that returns information about an inference - /// server and all models on the server using gRPC protocol. - /// \param ctx Returns a new ServerStatusGrpcContext object. 
- /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose = false); - - /// Create a context that returns information about an inference - /// server and one model on the sever using gRPC protocol. - /// \param ctx Returns a new ServerStatusGrpcContext object. - /// \param server_url The inference server name and port. - /// \param model_name The name of the model to get status for. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, bool verbose = false); - - /// Contact the inference server and get status. - /// \param status Returns the status. - /// \return Error object indicating success or failure. - Error GetServerStatus(ServerStatus* status) override; - - private: - ServerStatusGrpcContext(const std::string&, bool); - ServerStatusGrpcContext(const std::string&, const std::string&, bool); - - // Model name - const std::string model_name_; - - // gRPC end point. - std::unique_ptr stub_; -}; - -//============================================================================== -/// InferGrpcContext is the gRPC instantiation of InferContext. -/// -class InferGrpcContext : public InferContext { - public: - ~InferGrpcContext() override; - - /// Create context that performs inference for a model using gRPC protocol. - /// \param ctx Returns a new InferGrpcContext object. - /// \param server_url The inference server name and port. - /// \param model_name The name of the model to get status for. - /// \param model_version The version of the model to use for inference, - /// or -1 to indicate that the latest (i.e. highest version number) - /// version should be used. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - const std::string& model_name, int model_version = -1, - bool verbose = false); - - Error Run(std::vector>* results) override; - Error AsyncRun(std::shared_ptr* async_request) override; - Error GetAsyncRunResults( - std::vector>* results, - const std::shared_ptr& async_request, bool wait) override; - - private: - InferGrpcContext(const std::string&, const std::string&, int, bool); - - // @see InferContext.AsyncTransfer() - void AsyncTransfer() override; - - // @see InferContext.PreRunProcessing() - Error PreRunProcessing(std::shared_ptr& request) override; - - // additional vector contains 1-indexed key to available slots - // in async request map. - std::vector reusable_slot_; - - // The producer-consumer queue used to communicate asynchronously with - // the gRPC runtime. - grpc::CompletionQueue async_request_completion_queue_; - - // gRPC end point. - std::unique_ptr stub_; - - // request for gRPC call, one request object can be used for multiple calls - // since it can be overwritten as soon as the gRPC send finishes. - InferRequest request_; -}; - -//============================================================================== -//// ProfileGrpcContext is the gRPC instantiation of ProfileContext. 
-//// -class ProfileGrpcContext : public ProfileContext { - public: - /// Create context that controls profiling on a server using gRPC - /// protocol. - /// \param ctx Returns the new ProfileContext object. - /// \param server_url The inference server name and port. - /// \param verbose If true generate verbose output when contacting - /// the inference server. - /// \return Error object indicating success or failure. - static Error Create( - std::unique_ptr* ctx, const std::string& server_url, - bool verbose = false); - - private: - ProfileGrpcContext(const std::string&, bool); - Error SendCommand(const std::string& cmd_str) override; - - // gRPC end point. - std::unique_ptr stub_; -}; - -//============================================================================== - -std::ostream& operator<<(std::ostream&, const Error&); - -template -Error -InferContext::Result::GetRawAtCursor(size_t batch_idx, T* out) -{ - const uint8_t* buf; - Error err = GetRawAtCursor(batch_idx, &buf, sizeof(T)); - if (!err.IsOk()) { - return err; - } - - std::copy(buf, buf + sizeof(T), reinterpret_cast(out)); - return Error::Success; -} - -}}} // namespace nvidia::inferenceserver::client diff --git a/src/clients/c++/simple_client.cc b/src/clients/c++/simple_client.cc deleted file mode 100644 index 779e820afb..0000000000 --- a/src/clients/c++/simple_client.cc +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
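// A minimal sketch of cursor-based RAW reads using the GetRawAtCursor<T>
// template defined above: rewind the cursor for a batch entry, then read
// consecutive typed values. Element count and type are placeholders.
#include <cstddef>
#include <iostream>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

nic::Error
PrintFloats(
    nic::InferContext::Result* result, size_t batch_idx, size_t element_count)
{
  // Rewind to the beginning of this batch entry's tensor.
  nic::Error err = result->ResetCursor(batch_idx);
  if (!err.IsOk()) {
    return err;
  }

  for (size_t i = 0; i < element_count; ++i) {
    float value;
    err = result->GetRawAtCursor(batch_idx, &value);
    if (!err.IsOk()) {
      return err;
    }
    std::cout << value << " ";
  }
  std::cout << std::endl;
  return nic::Error::Success;
}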
- -#include "src/clients/c++/request.h" - -#include -#include -#include - -namespace ni = nvidia::inferenceserver; -namespace nic = nvidia::inferenceserver::client; - -#define FAIL_IF_ERR(X, MSG) \ - { \ - nic::Error err = (X); \ - if (!err.IsOk()) { \ - std::cerr << "error: " << (MSG) << ": " << err << std::endl; \ - exit(1); \ - } \ - } - -namespace { - -void -Usage(char** argv, const std::string& msg = std::string()) -{ - if (!msg.empty()) { - std::cerr << "error: " << msg << std::endl; - } - - std::cerr << "Usage: " << argv[0] << " [options]" << std::endl; - std::cerr << "\t-v" << std::endl; - std::cerr << "\t-i " - << std::endl; - std::cerr << "\t-u " << std::endl; - std::cerr << std::endl; - std::cerr - << "For -i, available protocols are 'grpc' and 'http'. Default is 'http." - << std::endl; - - exit(1); -} - -} // namespace - -int -main(int argc, char** argv) -{ - bool verbose = false; - std::string url("localhost:8000"); - std::string protocol = "http"; - - // Parse commandline... - int opt; - while ((opt = getopt(argc, argv, "vi:u:")) != -1) { - switch (opt) { - case 'v': - verbose = true; - break; - case 'i': - protocol = optarg; - break; - case 'u': - url = optarg; - break; - case '?': - Usage(argv); - break; - } - } - - nic::Error err; - - // We use a simple model that takes 2 input tensors of 16 integers - // each and returns 2 output tensors of 16 integers each. One output - // tensor is the element-wise sum of the inputs and one output is - // the element-wise difference. - std::string model_name = "simple"; - - // Create the inference context for the model. - std::unique_ptr ctx; - if (protocol == "http") { - err = nic::InferHttpContext::Create( - &ctx, url, model_name, -1 /* model_version */, verbose); - } else if (protocol == "grpc") { - err = nic::InferGrpcContext::Create( - &ctx, url, model_name, -1 /* model_version */, verbose); - } else { - Usage(argv, "unknown protocol '" + protocol + "'"); - } - - if (!err.IsOk()) { - std::cerr << "error: unable to create inference context: " << err - << std::endl; - exit(1); - } - - // Set the context options to do batch-size 1 requests. Also request - // that all output tensors be returned. - std::unique_ptr options; - FAIL_IF_ERR( - nic::InferContext::Options::Create(&options), - "unable to create inference options"); - - options->SetBatchSize(1); - for (const auto& output : ctx->Outputs()) { - options->AddRawResult(output); - } - - FAIL_IF_ERR(ctx->SetRunOptions(*options), "unable to set inference options"); - - // Create the data for the two input tensors. Initialize the first - // to unique integers and the second to all ones. - std::vector input0_data(16); - std::vector input1_data(16); - for (size_t i = 0; i < 16; ++i) { - input0_data[i] = i; - input1_data[i] = 1; - } - - // Initialize the inputs with the data. - std::shared_ptr input0, input1; - FAIL_IF_ERR(ctx->GetInput("INPUT0", &input0), "unable to get INPUT0"); - FAIL_IF_ERR(ctx->GetInput("INPUT1", &input1), "unable to get INPUT1"); - - FAIL_IF_ERR(input0->Reset(), "unable to reset INPUT0"); - FAIL_IF_ERR(input1->Reset(), "unable to reset INPUT1"); - - FAIL_IF_ERR( - input0->SetRaw( - reinterpret_cast(&input0_data[0]), input0->ByteSize()), - "unable to set data for INPUT0"); - FAIL_IF_ERR( - input1->SetRaw( - reinterpret_cast(&input1_data[0]), input1->ByteSize()), - "unable to set data for INPUT1"); - - // Send inference request to the inference server. 
- std::vector> results; - FAIL_IF_ERR(ctx->Run(&results), "unable to run model"); - - // We expect there to be 2 results. Walk over all 16 result elements - // and print the sum and difference calculated by the model. - if (results.size() != 2) { - std::cerr << "error: expected 2 results, got " << results.size() - << std::endl; - } - - for (size_t i = 0; i < 16; ++i) { - int32_t r0, r1; - FAIL_IF_ERR( - results[0]->GetRawAtCursor(0 /* batch idx */, &r0), - "unable to get OUTPUT0 result at idx " + std::to_string(i)); - FAIL_IF_ERR( - results[1]->GetRawAtCursor(0 /* batch idx */, &r1), - "unable to get OUTPUT1 result at idx " + std::to_string(i)); - std::cout << input0_data[i] << " + " << input1_data[i] << " = " << r0 - << std::endl; - std::cout << input0_data[i] << " - " << input1_data[i] << " = " << r1 - << std::endl; - - if ((input0_data[i] + input1_data[i]) != r0) { - std::cerr << "error: incorrect sum" << std::endl; - exit(1); - } - if ((input0_data[i] - input1_data[i]) != r1) { - std::cerr << "error: incorrect difference" << std::endl; - exit(1); - } - } - - return 0; -} diff --git a/src/clients/python/BUILD b/src/clients/python/BUILD deleted file mode 100644 index b3ed6241d3..0000000000 --- a/src/clients/python/BUILD +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
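// A minimal sketch of whole-tensor access, as an alternative to the
// cursor-based reads in simple_client.cc above: GetRaw() exposes the full
// raw byte buffer for one batch entry, which can then be reinterpreted as
// the tensor's element type. Template arguments are restored where this
// rendering stripped them.
#include <cstdint>
#include <cstring>
#include <vector>
#include "src/clients/c++/request.h"

namespace nic = nvidia::inferenceserver::client;

nic::Error
CopyOutputAsInt32(
    nic::InferContext::Result* result, size_t batch_idx,
    std::vector<int32_t>* values)
{
  const std::vector<uint8_t>* buf = nullptr;
  nic::Error err = result->GetRaw(batch_idx, &buf);
  if (!err.IsOk()) {
    return err;
  }

  values->resize(buf->size() / sizeof(int32_t));
  std::memcpy(values->data(), buf->data(), buf->size());
  return nic::Error::Success;
}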
- -package( - default_visibility = ["//visibility:public"], -) - -cc_library( - name = "crequest_base", - srcs = ["crequest.cc"], - hdrs = ["crequest.h"], - deps = [ - "//src/clients/c++:request", - ], -) - -cc_binary( - name = "libcrequest.so", - deps = [ - ":crequest_base", - "//src/clients/c++:request", - "//src/core:api_proto", - "//src/core:grpc_service_proto", - "//src/core:model_config_proto", - "//src/core:request_status_proto", - "//src/core:server_status_proto", - ], - linkshared = 1, - linkopts = [ - "-lcurl", - "-lz" - ], -) - -sh_binary( - name = "build_pip", - srcs = ["build_pip.sh"], - data = [ - "setup.py", - ":libcrequest.so", - "//src/core:api_proto_py_pb2", - "//src/core:grpc_service_proto_py_pb2", - "//src/core:model_config_proto_py_pb2", - "//src/core:request_status_proto_py_pb2", - "//src/core:server_status_proto_py_pb2", - ], -) diff --git a/src/clients/python/__init__.py b/src/clients/python/__init__.py deleted file mode 100644 index eab109f10c..0000000000 --- a/src/clients/python/__init__.py +++ /dev/null @@ -1,968 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from builtins import range -from enum import IntEnum -from future.utils import iteritems -from ctypes import * -import numpy as np -from numpy.ctypeslib import ndpointer -import pkg_resources -import tensorrtserver.api.model_config_pb2 -from tensorrtserver.api.server_status_pb2 import ServerStatus - -class _utf8(object): - @classmethod - def from_param(cls, value): - if value is None: - return None - elif isinstance(value, bytes): - return value - else: - return value.encode('utf8') - -_crequest_path = pkg_resources.resource_filename('tensorrtserver.api', 'libcrequest.so') -_crequest = cdll.LoadLibrary(_crequest_path) - -_crequest_error_new = _crequest.ErrorNew -_crequest_error_new.restype = c_void_p -_crequest_error_new.argtypes = [_utf8] -_crequest_error_del = _crequest.ErrorDelete -_crequest_error_del.argtypes = [c_void_p] -_crequest_error_isok = _crequest.ErrorIsOk -_crequest_error_isok.restype = c_bool -_crequest_error_isok.argtypes = [c_void_p] -_crequest_error_isunavailable = _crequest.ErrorIsUnavailable -_crequest_error_isunavailable.restype = c_bool -_crequest_error_isunavailable.argtypes = [c_void_p] -_crequest_error_msg = _crequest.ErrorMessage -_crequest_error_msg.restype = c_char_p -_crequest_error_msg.argtypes = [c_void_p] -_crequest_error_serverid = _crequest.ErrorServerId -_crequest_error_serverid.restype = c_char_p -_crequest_error_serverid.argtypes = [c_void_p] -_crequest_error_requestid = _crequest.ErrorRequestId -_crequest_error_requestid.restype = c_int64 -_crequest_error_requestid.argtypes = [c_void_p] - -_crequest_health_ctx_new = _crequest.ServerHealthContextNew -_crequest_health_ctx_new.restype = c_void_p -_crequest_health_ctx_new.argtypes = [POINTER(c_void_p), _utf8, c_int, c_bool] -_crequest_health_ctx_del = _crequest.ServerHealthContextDelete -_crequest_health_ctx_del.argtypes = [c_void_p] -_crequest_health_ctx_ready = _crequest.ServerHealthContextGetReady -_crequest_health_ctx_ready.restype = c_void_p -_crequest_health_ctx_ready.argtypes = [c_void_p, POINTER(c_bool)] -_crequest_health_ctx_live = _crequest.ServerHealthContextGetLive -_crequest_health_ctx_live.restype = c_void_p -_crequest_health_ctx_live.argtypes = [c_void_p, POINTER(c_bool)] - -_crequest_status_ctx_new = _crequest.ServerStatusContextNew -_crequest_status_ctx_new.restype = c_void_p -_crequest_status_ctx_new.argtypes = [POINTER(c_void_p), _utf8, c_int, _utf8, c_bool] -_crequest_status_ctx_del = _crequest.ServerStatusContextDelete -_crequest_status_ctx_del.argtypes = [c_void_p] -_crequest_status_ctx_get = _crequest.ServerStatusContextGetServerStatus -_crequest_status_ctx_get.restype = c_void_p -_crequest_status_ctx_get.argtypes = [c_void_p, POINTER(c_char_p), POINTER(c_uint32)] - -_crequest_infer_ctx_new = _crequest.InferContextNew -_crequest_infer_ctx_new.restype = c_void_p -_crequest_infer_ctx_new.argtypes = [POINTER(c_void_p), _utf8, c_int, _utf8, c_int, c_bool] -_crequest_infer_ctx_del = _crequest.InferContextDelete -_crequest_infer_ctx_del.argtypes = [c_void_p] -_crequest_infer_ctx_set_options = _crequest.InferContextSetOptions -_crequest_infer_ctx_set_options.restype = c_void_p -_crequest_infer_ctx_set_options.argtypes = [c_void_p, c_void_p] -_crequest_infer_ctx_run = _crequest.InferContextRun -_crequest_infer_ctx_run.restype = c_void_p -_crequest_infer_ctx_run.argtypes = [c_void_p] -_crequest_infer_ctx_async_run = _crequest.InferContextAsyncRun -_crequest_infer_ctx_async_run.restype = c_void_p -_crequest_infer_ctx_async_run.argtypes = [c_void_p, POINTER(c_uint64)] 
-_crequest_infer_ctx_get_async_run_results = _crequest.InferContextGetAsyncRunResults -_crequest_infer_ctx_get_async_run_results.restype = c_void_p -_crequest_infer_ctx_get_async_run_results.argtypes = [c_void_p, c_uint64, c_bool] -_crequest_infer_ctx_get_ready_async_request = _crequest.InferContextGetReadyAsyncRequest -_crequest_infer_ctx_get_ready_async_request.restype = c_void_p -_crequest_infer_ctx_get_ready_async_request.argtypes = [c_void_p, POINTER(c_uint64), c_bool] - -_crequest_infer_ctx_options_new = _crequest.InferContextOptionsNew -_crequest_infer_ctx_options_new.restype = c_void_p -_crequest_infer_ctx_options_new.argtypes = [POINTER(c_void_p), c_uint64] -_crequest_infer_ctx_options_del = _crequest.InferContextOptionsDelete -_crequest_infer_ctx_options_del.argtypes = [c_void_p] -_crequest_infer_ctx_options_add_raw = _crequest.InferContextOptionsAddRaw -_crequest_infer_ctx_options_add_raw.restype = c_void_p -_crequest_infer_ctx_options_add_raw.argtypes = [c_void_p, c_void_p, _utf8] -_crequest_infer_ctx_options_add_class = _crequest.InferContextOptionsAddClass -_crequest_infer_ctx_options_add_class.restype = c_void_p -_crequest_infer_ctx_options_add_class.argtypes = [c_void_p, c_void_p, _utf8, c_uint64] - -_crequest_infer_ctx_input_new = _crequest.InferContextInputNew -_crequest_infer_ctx_input_new.restype = c_void_p -_crequest_infer_ctx_input_new.argtypes = [POINTER(c_void_p), c_void_p, _utf8] -_crequest_infer_ctx_input_del = _crequest.InferContextInputDelete -_crequest_infer_ctx_input_del.argtypes = [c_void_p] -_crequest_infer_ctx_input_set_raw = _crequest.InferContextInputSetRaw -_crequest_infer_ctx_input_set_raw.restype = c_void_p -_crequest_infer_ctx_input_set_raw.argtypes = [c_void_p, c_void_p, c_uint64] - -_crequest_infer_ctx_result_new = _crequest.InferContextResultNew -_crequest_infer_ctx_result_new.restype = c_void_p -_crequest_infer_ctx_result_new.argtypes = [POINTER(c_void_p), c_void_p, _utf8] -_crequest_infer_ctx_result_del = _crequest.InferContextResultDelete -_crequest_infer_ctx_result_del.argtypes = [c_void_p] -_crequest_infer_ctx_result_modelname = _crequest.InferContextResultModelName -_crequest_infer_ctx_result_modelname.restype = c_void_p -_crequest_infer_ctx_result_modelname.argtypes = [c_void_p, POINTER(c_char_p)] -_crequest_infer_ctx_result_modelver = _crequest.InferContextResultModelVersion -_crequest_infer_ctx_result_modelver.restype = c_void_p -_crequest_infer_ctx_result_modelver.argtypes = [c_void_p, POINTER(c_uint32)] -_crequest_infer_ctx_result_dtype = _crequest.InferContextResultDataType -_crequest_infer_ctx_result_dtype.restype = c_void_p -_crequest_infer_ctx_result_dtype.argtypes = [c_void_p, POINTER(c_uint32)] -_crequest_infer_ctx_result_dims = _crequest.InferContextResultDims -_crequest_infer_ctx_result_dims.restype = c_void_p -_crequest_infer_ctx_result_dims.argtypes = [c_void_p, c_uint64, - ndpointer(c_uint32, flags="C_CONTIGUOUS"), - POINTER(c_uint64)] -_crequest_infer_ctx_result_next_raw = _crequest.InferContextResultNextRaw -_crequest_infer_ctx_result_next_raw.restype = c_void_p -_crequest_infer_ctx_result_next_raw.argtypes = [c_void_p, c_uint64, POINTER(c_char_p), - POINTER(c_uint64)] -_crequest_infer_ctx_result_class_cnt = _crequest.InferContextResultClassCount -_crequest_infer_ctx_result_class_cnt.restype = c_void_p -_crequest_infer_ctx_result_class_cnt.argtypes = [c_void_p, c_uint64, POINTER(c_uint64)] -_crequest_infer_ctx_result_next_class = _crequest.InferContextResultNextClass -_crequest_infer_ctx_result_next_class.restype = c_void_p 
-_crequest_infer_ctx_result_next_class.argtypes = [c_void_p, c_uint64, POINTER(c_uint64), - POINTER(c_float), POINTER(c_char_p)] - - -def _raise_if_error(err): - """ - Raise InferenceServerException if 'err' is non-success. - Otherwise return the request ID. - """ - if err.value is not None: - ex = InferenceServerException(err) - isok = _crequest_error_isok(err) - _crequest_error_del(err) - if not isok: - raise ex - return ex.request_id() - return 0 - -def _raise_error(msg): - err = c_void_p(_crequest_error_new(msg)) - ex = InferenceServerException(err) - _crequest_error_del(err) - raise ex - - -class ProtocolType(IntEnum): - """Protocol types supported by the client API - - HTTP - The HTTP protocol. - GRPC - The GRPC protocol. - - """ - HTTP = 0 - GRPC = 1 - - @classmethod - def from_str(cls, value): - """Convert a string to the corresponding ProtocolType. - - Parameters - ---------- - value : str - The string value to convert. - - Returns - ------- - ProtocolType - The ProtocolType corresponding to 'value'. - - Raises - ------ - Exception - If 'value' is an unknown protocol. - - """ - if value.lower() == 'http': - return ProtocolType.HTTP - elif value.lower() == 'grpc': - return ProtocolType.GRPC - raise Exception("unexpected protocol: " + value + - ", expecting HTTP or gRPC") - return ProtocolType.HTTP - -class InferenceServerException(Exception): - """Exception indicating non-Success status. - - Parameters - ---------- - err : c_void_p - Pointer to an Error that should be used to initialize the exception. - - """ - def __init__(self, err): - self._msg = None - self._server_id = None - self._request_id = 0 - if (err is not None) and (err.value is not None): - self._msg = _crequest_error_msg(err) - if self._msg is not None: - self._msg = self._msg.decode('utf-8') - self._server_id = _crequest_error_serverid(err) - if self._server_id is not None: - self._server_id = self._server_id.decode('utf-8') - self._request_id = _crequest_error_requestid(err) - - def __str__(self): - msg = super().__str__() if self._msg is None else self._msg - if self._server_id is not None: - msg = '[' + self._server_id + ' ' + str(self._request_id) + '] ' + msg - return msg - - def message(self): - """Get the exception message. - - Returns - ------- - str - The message associated with this exception, or None if no message. - - """ - return self._msg - - def server_id(self): - """Get the ID of the server associated with this exception. - - Returns - ------- - str - The ID of the server associated with this exception, or - None if no server is associated. - - """ - return self._server_id - - def request_id(self): - """Get the ID of the request with this exception. - - Returns - ------- - int - The ID of the request associated with this exception, or - 0 (zero) if no request is associated. - - """ - return self._request_id - -class ServerHealthContext: - """Performs a health request to an inference server. - - Parameters - ---------- - url : str - The inference server URL, e.g. localhost:8000. - - protocol : ProtocolType - The protocol used to communicate with the server. - - verbose : bool - If True generate verbose output. 
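# Sketch of how the health API documented above is typically used (not part of
# the original file): the URL "localhost:8000" and the HTTP protocol are
# assumed here purely for illustration.
from tensorrtserver.api import ProtocolType, ServerHealthContext

# The context-manager form calls close() automatically on exit.
with ServerHealthContext("localhost:8000", ProtocolType.HTTP, verbose=False) as health_ctx:
    print("server live: ", health_ctx.is_live())
    print("server ready:", health_ctx.is_ready())
    print("request id:  ", health_ctx.get_last_request_id())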
- - """ - def __init__(self, url, protocol, verbose=False): - self._last_request_id = 0 - self._ctx = c_void_p() - _raise_if_error( - c_void_p( - _crequest_health_ctx_new( - byref(self._ctx), url, int(protocol), verbose))) - - def __del__(self): - # when module is unloading may get called after - # _crequest_health_ctx_del has been released - if _crequest_health_ctx_del is not None: - self.close() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def close(self): - """Close the context. Any future calls to is_ready() or is_live() will - result in an Error. - - """ - _crequest_health_ctx_del(self._ctx) - self._ctx = None - - def is_ready(self): - """Contact the inference server and get readiness. - - Returns - ------- - bool - True if server is ready, False if server is not ready. - - Raises - ------ - InferenceServerException - If unable to get readiness. - - """ - self._last_request_id = None - if self._ctx is None: - _raise_error("ServerHealthContext is closed") - - cready = c_bool() - self._last_request_id = _raise_if_error( - c_void_p(_crequest_health_ctx_ready(self._ctx, byref(cready)))) - return cready.value - - def is_live(self): - """Contact the inference server and get liveness. - - Returns - ------- - bool - True if server is live, False if server is not live. - - Raises - ------ - InferenceServerException - If unable to get liveness. - - """ - self._last_request_id = None - if self._ctx is None: - _raise_error("ServerHealthContext is closed") - - clive = c_bool() - self._last_request_id = _raise_if_error( - c_void_p(_crequest_health_ctx_live(self._ctx, byref(clive)))) - return clive.value - - def get_last_request_id(self): - """Get the request ID of the most recent is_ready() or is_live() - request. - - Returns - ------- - int - The request ID, or None if a request has not yet been made - or if the last request was not successful. - - """ - return self._last_request_id - - -class ServerStatusContext: - """Performs a status request to an inference server. - - A request can be made to get status for the server and all models - managed by the server, or to get status foronly a single model. - - Parameters - ---------- - url : str - The inference server URL, e.g. localhost:8000. - - protocol : ProtocolType - The protocol used to communicate with the server. - - model_name : str - The name of the model to get status for, or None to get status - for all models managed by the server. - - verbose : bool - If True generate verbose output. - - """ - def __init__(self, url, protocol, model_name=None, verbose=False): - self._last_request_id = 0 - self._ctx = c_void_p() - _raise_if_error( - c_void_p( - _crequest_status_ctx_new( - byref(self._ctx), url, int(protocol), model_name, verbose))) - - def __del__(self): - # when module is unloading may get called after - # _crequest_status_ctx_del has been released - if _crequest_status_ctx_del is not None: - self.close() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def close(self): - """Close the context. Any future calls to get_server_status() will - result in an Error. - - """ - _crequest_status_ctx_del(self._ctx) - self._ctx = None - - def get_server_status(self): - """Contact the inference server and get status. - - Returns - ------- - ServerStatus - The ServerStatus protobuf containing the status. - - Raises - ------ - InferenceServerException - If unable to get status. 
- - """ - self._last_request_id = None - if self._ctx is None: - _raise_error("ServerStatusContext is closed") - - cstatus = c_char_p() - cstatus_len = c_uint32() - self._last_request_id = _raise_if_error( - c_void_p(_crequest_status_ctx_get( - self._ctx, byref(cstatus), byref(cstatus_len)))) - status_buf = cast(cstatus, POINTER(c_byte * cstatus_len.value))[0] - - status = ServerStatus() - status.ParseFromString(status_buf) - return status - - def get_last_request_id(self): - """Get the request ID of the most recent get_server_status() request. - - Returns - ------- - int - The request ID, or None if a request has not yet been made - or if the last request was not successful. - - """ - return self._last_request_id - - -class InferContext: - """An InferContext object is used to run inference on an inference - server for a specific model. - - Once created an InferContext object can be used repeatedly to - perform inference using the model. - - Parameters - ---------- - url : str - The inference server URL, e.g. localhost:8000. - - protocol : ProtocolType - The protocol used to communicate with the server. - - model_name : str - The name of the model to get status for, or None to get status - for all models managed by the server. - - model_version : int - The version of the model to use for inference, - or None to indicate that the latest (i.e. highest version number) - version should be used. - - verbose : bool - If True generate verbose output. - - """ - class ResultFormat: - """Formats for output tensor results. - - RAW - All values of the output are returned as an numpy array - of the appropriate type. - - CLASS - Specified as tuple (CLASS, k). Top 'k' results - are returned as an array of (index, value, label) tuples. - - """ - RAW = 1, - CLASS = 2 - - def __init__(self, url, protocol, model_name, model_version=None, verbose=False): - self._last_request_id = None - self._last_request_model_name = None - self._last_request_model_version = None - self._requested_outputs_dict = dict() - self._ctx = c_void_p() - - imodel_version = -1 if model_version is None else model_version - _raise_if_error( - c_void_p( - _crequest_infer_ctx_new( - byref(self._ctx), url, int(protocol), - model_name, imodel_version, verbose))) - - def __del__(self): - # when module is unloading may get called after - # _crequest_infer_ctx_del has been released - if _crequest_infer_ctx_del is not None: - self.close() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def _get_result_numpy_dtype(self, result): - ctype = c_uint32() - _raise_if_error(c_void_p(_crequest_infer_ctx_result_dtype(result, byref(ctype)))) - if ctype.value == model_config_pb2.TYPE_BOOL: - return np.bool_ - elif ctype.value == model_config_pb2.TYPE_UINT8: - return np.uint8 - elif ctype.value == model_config_pb2.TYPE_UINT16: - return np.uint16 - elif ctype.value == model_config_pb2.TYPE_UINT32: - return np.uint32 - elif ctype.value == model_config_pb2.TYPE_UINT64: - return np.uint64 - elif ctype.value == model_config_pb2.TYPE_INT8: - return np.int8 - elif ctype.value == model_config_pb2.TYPE_INT16: - return np.int16 - elif ctype.value == model_config_pb2.TYPE_INT32: - return np.int32 - elif ctype.value == model_config_pb2.TYPE_INT64: - return np.int64 - elif ctype.value == model_config_pb2.TYPE_FP16: - return np.float16 - elif ctype.value == model_config_pb2.TYPE_FP32: - return np.float32 - elif ctype.value == model_config_pb2.TYPE_FP64: - return np.float64 - _raise_error("unknown result datatype " 
+ ctype.value) - - def _prepare_request(self, inputs, outputs, batch_size, contiguous_input_values): - # Make sure each input is given as a list (one entry per - # batch). It is a common error when using batch-size 1 to - # specify an input directly as an array instead of as a list - # containing one array. - for inp_name, inp in inputs.items(): - if not isinstance(inp, (list, tuple)): - _raise_error("input '" + inp_name + - "' values must be specified as a list or numpy arrays") - - # Set run options using formats specified in 'outputs' - options = c_void_p() - try: - _raise_if_error(c_void_p(_crequest_infer_ctx_options_new(byref(options), batch_size))) - - for (output_name, output_format) in iteritems(outputs): - if output_format == InferContext.ResultFormat.RAW: - _raise_if_error( - c_void_p( - _crequest_infer_ctx_options_add_raw(self._ctx, options, output_name))) - elif (isinstance(output_format, (list, tuple)) and - (output_format[0] == InferContext.ResultFormat.CLASS)): - _raise_if_error( - c_void_p( - _crequest_infer_ctx_options_add_class( - self._ctx, options, output_name, c_uint64(output_format[1])))) - else: - _raise_error("unrecognized output format") - - _raise_if_error(c_void_p(_crequest_infer_ctx_set_options(self._ctx, options))) - - finally: - _crequest_infer_ctx_options_del(options) - - # Set the input values in the provided 'contiguous_input_values' - for (input_name, input_values) in iteritems(inputs): - input = c_void_p() - try: - _raise_if_error( - c_void_p(_crequest_infer_ctx_input_new(byref(input), self._ctx, input_name))) - - for input_value in input_values: - if not input_value.flags['C_CONTIGUOUS']: - input_value = np.ascontiguousarray(input_value) - contiguous_input_values.append(input_value) - _raise_if_error( - c_void_p( - _crequest_infer_ctx_input_set_raw( - input, input_value.ctypes.data_as(c_void_p), - c_uint64(input_value.size * input_value.itemsize)))) - finally: - _crequest_infer_ctx_input_del(input) - - def _get_results(self, outputs, batch_size): - # Create the result map. 
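# The dict assembled below follows the layout documented for run() and
# async_run(): one key per requested output, one list element per batch entry.
# Roughly (output names and shapes here are assumed for illustration):
#   RAW output:        {"OUTPUT0": [ndarray_for_batch_0, ndarray_for_batch_1, ...]}
#   (CLASS, k) output: {"OUTPUT0": [[(index, value, label), ... k tuples], ...]}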
- results = dict() - for (output_name, output_format) in iteritems(outputs): - result = c_void_p() - try: - _raise_if_error( - c_void_p(_crequest_infer_ctx_result_new(byref(result), self._ctx, output_name))) - - # The model name and version are the same for every - # result so only set once - if self._last_request_model_name is None: - cmodelname = c_char_p() - _raise_if_error( - c_void_p( - _crequest_infer_ctx_result_modelname(result, byref(cmodelname)))) - if cmodelname.value is not None: - self._last_request_model_name = cmodelname.value.decode('utf-8') - if self._last_request_model_version is None: - cmodelver = c_uint32() - _raise_if_error( - c_void_p( - _crequest_infer_ctx_result_modelver(result, byref(cmodelver)))) - self._last_request_model_version = cmodelver.value - - result_dtype = self._get_result_numpy_dtype(result) - results[output_name] = list() - if output_format == InferContext.ResultFormat.RAW: - for b in range(batch_size): - # Get the result value into a 1-dim np array - # of the appropriate type - cval = c_char_p() - cval_len = c_uint64() - _raise_if_error( - c_void_p( - _crequest_infer_ctx_result_next_raw( - result, b, byref(cval), byref(cval_len)))) - val_buf = cast(cval, POINTER(c_byte * cval_len.value))[0] - val = np.frombuffer(val_buf, dtype=result_dtype) - # Reshape the result to the appropriate shape - max_shape_dims = 16 - shape = np.zeros(max_shape_dims, dtype=np.uint32) - shape_len = c_uint64() - _raise_if_error( - c_void_p( - _crequest_infer_ctx_result_dims( - result, c_uint64(max_shape_dims), - shape, byref(shape_len)))) - shaped = np.reshape(np.copy(val), np.resize(shape, shape_len.value).tolist()) - results[output_name].append(shaped) - - elif (isinstance(output_format, (list, tuple)) and - (output_format[0] == InferContext.ResultFormat.CLASS)): - for b in range(batch_size): - classes = list() - ccnt = c_uint64() - _raise_if_error( - c_void_p(_crequest_infer_ctx_result_class_cnt(result, b, byref(ccnt)))) - for cc in range(ccnt.value): - cidx = c_uint64() - cprob = c_float() - clabel = c_char_p() - _raise_if_error( - c_void_p( - _crequest_infer_ctx_result_next_class( - result, b, byref(cidx), byref(cprob), byref(clabel)))) - label = None if clabel.value is None else clabel.value.decode('utf-8') - classes.append((cidx.value, cprob.value, label)) - results[output_name].append(classes) - else: - _raise_error("unrecognized output format") - finally: - _crequest_infer_ctx_result_del(result) - - return results - - def close(self): - """Close the context. Any future calls to object will result in an - Error. - - """ - _crequest_infer_ctx_del(self._ctx) - self._ctx = None - - def run(self, inputs, outputs, batch_size=1): - """Run inference using the supplied 'inputs' to calculate the outputs - specified by 'outputs'. - - Parameters - ---------- - inputs : dict - Dictionary from input name to the value(s) for that - input. An input value is specified as a numpy array. Each - input in the dictionary maps to a list of values (i.e. a - list of numpy array objects), where the length of the list - must equal the 'batch_size'. - - outputs : dict - Dictionary from output name to a value indicating the - ResultFormat that should be used for that output. For RAW - the value should be ResultFormat.RAW. For CLASS the value - should be a tuple (ResultFormat.CLASS, k), where 'k' - indicates how many classification results should be - returned for the output. - - batch_size : int - The batch size of the inference. Each input must provide - an appropriately sized batch of inputs. 
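# A minimal sketch of assembling the parameters described above; the model
# name "simple", the tensor names and the shapes are assumptions made for
# illustration only.
import numpy as np
from tensorrtserver.api import InferContext, ProtocolType

batch_size = 2
ctx = InferContext("localhost:8000", ProtocolType.HTTP, "simple")
# One numpy array per batch entry for each input.
inputs = {"INPUT0": [np.zeros(16, dtype=np.float32) for _ in range(batch_size)]}
# RAW returns the full tensor; (CLASS, k) would return the top-k classes instead.
outputs = {"OUTPUT0": InferContext.ResultFormat.RAW}
results = ctx.run(inputs, outputs, batch_size)
# results["OUTPUT0"] is a list of numpy arrays, one per batch entry.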
- - Returns - ------- - dict - A dictionary from output name to the list of values for - that output (one list element for each entry of the - batch). The format of a value returned for an output - depends on the output format specified in 'outputs'. For - format RAW a value is a numpy array of the appropriate - type and shape for the output. For format CLASS a value is - the top 'k' output values returned as an array of (class - index, class value, class label) tuples. - - Raises - ------ - InferenceServerException - If all inputs are not specified, if the size of input data - does not match expectations, if unknown output names are - specified or if server fails to perform inference. - - """ - self._last_request_id = None - self._last_request_model_name = None - self._last_request_model_version = None - - # The input values must be contiguous and the lifetime of those - # contiguous copies must span until the inference completes - # so grab a reference to them at this scope. - contiguous_input = list() - - # Set run option and input values - self._prepare_request(inputs, outputs, batch_size, contiguous_input) - - # Run inference... - self._last_request_id = _raise_if_error(c_void_p(_crequest_infer_ctx_run(self._ctx))) - - return self._get_results(outputs, batch_size) - - def async_run(self, inputs, outputs, batch_size=1): - """Run inference using the supplied 'inputs' to calculate the outputs - specified by 'outputs'. - - Unlike run(), async_run() returns immediately after sending - the inference request to the server. The returned integer - identifier must be used subsequently to wait on and retrieve - the actual inference results. - - Parameters - ---------- - inputs : dict - Dictionary from input name to the value(s) for that - input. An input value is specified as a numpy array. Each - input in the dictionary maps to a list of values (i.e. a - list of numpy array objects), where the length of the list - must equal the 'batch_size'. - - outputs : dict - Dictionary from output name to a value indicating the - ResultFormat that should be used for that output. For RAW - the value should be ResultFormat.RAW. For CLASS the value - should be a tuple (ResultFormat.CLASS, k), where 'k' - indicates how many classification results should be - returned for the output. - - batch_size : int - The batch size of the inference. Each input must provide - an appropriately sized batch of inputs. - - - Returns - ------- - int - Integer identifier which must be passed to - get_async_run_results() to wait on and retrieve the - inference results. - - Raises - ------ - InferenceServerException - If all inputs are not specified, if the size of input data - does not match expectations, if unknown output names are - specified or if server fails to perform inference. - - """ - # Same situation as in run(), but the list will be kept inside - # the object given that the request is asynchronous - contiguous_input = list() - - # Set run option and input values - self._prepare_request(inputs, outputs, batch_size, contiguous_input) - - # Run asynchronous inference... 
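# The C-library call below issues the request and returns without waiting.
# From the caller's side, the flow documented above looks roughly like this
# sketch (context, inputs and outputs as in the run() example, all assumed
# for illustration):
#
#   request_id = ctx.async_run(inputs, outputs, batch_size)
#   ...                      # do other work while the server runs inference
#   results = ctx.get_async_run_results(request_id, wait=True)
#
# or, with several requests in flight, take whichever completes first:
#
#   ready_id = ctx.get_ready_async_request(wait=True)
#   results = ctx.get_async_run_results(ready_id, wait=True)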
- c_request_id = c_uint64() - _raise_if_error( - c_void_p( - _crequest_infer_ctx_async_run(self._ctx, byref(c_request_id)))) - - self._requested_outputs_dict[c_request_id.value] = (outputs, batch_size, contiguous_input) - - return c_request_id.value - - def get_async_run_results(self, request_id, wait): - """Retrieve the results of a previous async_run() using the supplied - 'request_id' - - Parameters - ---------- - request_id : int - The integer ID of the asynchronous request returned by async_run(). - - wait : bool - If True block until the request results are ready. If False return - immediately even if results are not ready. - - Returns - ------- - dict - None if the results are not ready and 'wait' is False. A - dictionary from output name to the list of values for that - output (one list element for each entry of the batch). The - format of a value returned for an output depends on the - output format specified in 'outputs'. For format RAW a - value is a numpy array of the appropriate type and shape - for the output. For format CLASS a value is the top 'k' - output values returned as an array of (class index, class - value, class label) tuples. - - Raises - ------ - InferenceServerException - If the request ID supplied is not valid, or if the server - fails to perform inference. - - """ - # Get async run results - err = c_void_p(_crequest_infer_ctx_get_async_run_results( - self._ctx, request_id, wait)) - - if not wait: - isunavailable = _crequest_error_isunavailable(err) - if isunavailable: - _crequest_error_del(err) - return None - - self._last_request_id = _raise_if_error(err) - - requested_outputs = self._requested_outputs_dict[request_id] - del self._requested_outputs_dict[request_id] - - return self._get_results(requested_outputs[0], requested_outputs[1]) - - def get_ready_async_request(self, wait): - """Get the request ID of an async_run() request that has completed but - not yet had results read with get_async_run_results(). - - Parameters - ---------- - wait : bool - If True block until an async request is ready. If False return - immediately even if results are not ready. - - Returns - ------- - int - None if no asynchronous results are ready and 'wait' is - False. An integer identifier which must be passed to - get_async_run_results() to wait on and retrieve the - inference results. - - Raises - ------ - InferenceServerException - If no asynchronous request is in flight or completed. - - """ - # Get async run results - c_request_id = c_uint64() - err = c_void_p(_crequest_infer_ctx_get_ready_async_request( - self._ctx, byref(c_request_id), wait)) - - if not wait: - isunavailable = _crequest_error_isunavailable(err) - if isunavailable: - _crequest_error_del(err) - return None - - _raise_if_error(err) - - return c_request_id.value - - def get_last_request_id(self): - """Get the request ID of the most recent run() request. - - Returns - ------- - int - The request ID, or None if a request has not yet been made - or if the last request was not successful. - - """ - return self._last_request_id - - def get_last_request_model_name(self): - """Get the model name used in the most recent run() request. - - Returns - ------- - str - The model name, or None if a request has not yet been made - or if the last request was not successful. - - """ - return self._last_request_model_name - - def get_last_request_model_version(self): - """Get the model version used in the most recent run() request. 
- - Returns - ------- - int - The model version, or None if a request has not yet been made - or if the last request was not successful. - - """ - return self._last_request_model_version diff --git a/src/clients/python/build_pip.sh b/src/clients/python/build_pip.sh deleted file mode 100755 index 9b1b1837d4..0000000000 --- a/src/clients/python/build_pip.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#!/bin/bash -# Copyright 2017 Google Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -function main() { - if [[ $# -lt 1 ]] ; then - echo "usage: $0 " - exit 1 - fi - - if [[ ! -d "bazel-bin/src/clients/python" ]]; then - echo "Could not find bazel-bin/src/clients/python" - exit 1 - fi - - if [[ ! -f "VERSION" ]]; then - echo "Could not find VERSION" - exit 1 - fi - - VERSION=`cat VERSION` - DEST="$1" - TMPDIR="$(mktemp -d)" - - echo $(date) : "=== Using tmpdir: ${TMPDIR}" - mkdir -p ${TMPDIR}/tensorrtserver/api - - echo "Adding package files" - cp bazel-genfiles/src/core/*_pb2.py \ - "${TMPDIR}/tensorrtserver/api/." - - cp bazel-genfiles/src/core/*_grpc.py \ - "${TMPDIR}/tensorrtserver/api/." - - cp bazel-bin/src/clients/python/libcrequest.so \ - "${TMPDIR}/tensorrtserver/api/." - - cp src/clients/python/__init__.py \ - "${TMPDIR}/tensorrtserver/api/." 
- - cp src/clients/python/setup.py "${TMPDIR}" - touch ${TMPDIR}/tensorrtserver/__init__.py - - # Use 'sed' command to fix protoc compiled imports (see - # https://github.com/google/protobuf/issues/1491). - sed -i "s/^from src\.core import \([^ ]*\)_pb2 as \([^ ]*\)$/from tensorrtserver.api import \1_pb2 as \2/" \ - ${TMPDIR}/tensorrtserver/api/*_pb2.py - sed -i "s/^from src\.core import \([^ ]*\)_pb2 as \([^ ]*\)$/from tensorrtserver.api import \1_pb2 as \2/" \ - ${TMPDIR}/tensorrtserver/api/*_pb2_grpc.py - - pushd "${TMPDIR}" - echo $(date) : "=== Building wheel" - VERSION=$VERSION python${PYVER} setup.py bdist_wheel # >/dev/null - mkdir -p "${DEST}" - cp dist/* "${DEST}" - popd - rm -rf "${TMPDIR}" - echo $(date) : "=== Output wheel file is in: ${DEST}" -} - -main "$@" diff --git a/src/clients/python/crequest.cc b/src/clients/python/crequest.cc deleted file mode 100644 index 75741adac9..0000000000 --- a/src/clients/python/crequest.cc +++ /dev/null @@ -1,577 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -#include "src/clients/python/crequest.h" - -#include - -namespace ni = nvidia::inferenceserver; -namespace nic = nvidia::inferenceserver::client; - -//============================================================================== -nic::Error* -ErrorNew(const char* msg) -{ - return new nic::Error(ni::RequestStatusCode::INTERNAL, std::string(msg)); -} - -void -ErrorDelete(nic::Error* ctx) -{ - delete ctx; -} - -bool -ErrorIsOk(nic::Error* ctx) -{ - return ctx->IsOk(); -} - -bool -ErrorIsUnavailable(nic::Error* ctx) -{ - return (ctx->Code() == ni::RequestStatusCode::UNAVAILABLE); -} - -const char* -ErrorMessage(nic::Error* ctx) -{ - return ctx->Message().c_str(); -} - -const char* -ErrorServerId(nic::Error* ctx) -{ - return ctx->ServerId().c_str(); -} - -uint64_t -ErrorRequestId(nic::Error* ctx) -{ - return ctx->RequestId(); -} - -//============================================================================== -namespace { - -enum ProtocolType { HTTP = 0, GRPC = 1 }; - -nic::Error -ParseProtocol(ProtocolType* protocol, const int protocol_int) -{ - if (protocol_int == 0) { - *protocol = ProtocolType::HTTP; - return nic::Error::Success; - } else if (protocol_int == 1) { - *protocol = ProtocolType::GRPC; - return nic::Error::Success; - } - return nic::Error( - ni::RequestStatusCode::INVALID_ARG, - "unexpected protocol integer, expecting 0 for HTTP or 1 for gRPC"); -} - -} // namespace - -//============================================================================== -struct ServerHealthContextCtx { - std::unique_ptr ctx; -}; - -nic::Error* -ServerHealthContextNew( - ServerHealthContextCtx** ctx, const char* url, int protocol_int, bool verbose) -{ - nic::Error err; - ProtocolType protocol; - err = ParseProtocol(&protocol, protocol_int); - if (err.IsOk()) { - ServerHealthContextCtx* lctx = new ServerHealthContextCtx; - if (protocol == ProtocolType::HTTP) { - err = nic::ServerHealthHttpContext::Create( - &(lctx->ctx), std::string(url), verbose); - } else { - err = nic::ServerHealthGrpcContext::Create( - &(lctx->ctx), std::string(url), verbose); - } - - if (err.IsOk()) { - *ctx = lctx; - return nullptr; - } - - delete lctx; - } - - *ctx = nullptr; - return new nic::Error(err); -} - -void -ServerHealthContextDelete(ServerHealthContextCtx* ctx) -{ - delete ctx; -} - -nic::Error* -ServerHealthContextGetReady(ServerHealthContextCtx* ctx, bool* ready) -{ - nic::Error err = ctx->ctx->GetReady(ready); - if (err.IsOk()) { - return nullptr; - } - - return new nic::Error(err); -} - -nic::Error* -ServerHealthContextGetLive(ServerHealthContextCtx* ctx, bool* live) -{ - nic::Error err = ctx->ctx->GetLive(live); - if (err.IsOk()) { - return nullptr; - } - - return new nic::Error(err); -} - -//============================================================================== -struct ServerStatusContextCtx { - std::unique_ptr ctx; - std::string status_buf; -}; - -nic::Error* -ServerStatusContextNew( - ServerStatusContextCtx** ctx, const char* url, int protocol_int, - const char* model_name, bool verbose) -{ - nic::Error err; - ProtocolType protocol; - err = ParseProtocol(&protocol, protocol_int); - if (err.IsOk()) { - ServerStatusContextCtx* lctx = new ServerStatusContextCtx; - if (model_name == nullptr) { - if (protocol == ProtocolType::HTTP) { - err = nic::ServerStatusHttpContext::Create( - &(lctx->ctx), std::string(url), verbose); - } else { - err = nic::ServerStatusGrpcContext::Create( - &(lctx->ctx), std::string(url), verbose); - } - } else { - if (protocol == ProtocolType::HTTP) { - err = 
nic::ServerStatusHttpContext::Create( - &(lctx->ctx), std::string(url), std::string(model_name), verbose); - } else { - err = nic::ServerStatusGrpcContext::Create( - &(lctx->ctx), std::string(url), std::string(model_name), verbose); - } - } - - if (err.IsOk()) { - *ctx = lctx; - return nullptr; - } - - delete lctx; - } - - *ctx = nullptr; - return new nic::Error(err); -} - -void -ServerStatusContextDelete(ServerStatusContextCtx* ctx) -{ - delete ctx; -} - -nic::Error* -ServerStatusContextGetServerStatus( - ServerStatusContextCtx* ctx, char** status, uint32_t* status_len) -{ - ctx->status_buf.clear(); - - ni::ServerStatus server_status; - nic::Error err = ctx->ctx->GetServerStatus(&server_status); - if (err.IsOk()) { - if (server_status.SerializeToString(&ctx->status_buf)) { - *status = &ctx->status_buf[0]; - *status_len = ctx->status_buf.size(); - } else { - err = nic::Error( - ni::RequestStatusCode::INTERNAL, "failed to parse server status"); - } - } - - return new nic::Error(err); -} - -//============================================================================== -struct InferContextCtx { - std::unique_ptr ctx; - std::vector> results; - std::vector> requests; -}; - -nic::Error* -InferContextNew( - InferContextCtx** ctx, const char* url, int protocol_int, - const char* model_name, int model_version, bool verbose) -{ - nic::Error err; - ProtocolType protocol; - err = ParseProtocol(&protocol, protocol_int); - if (err.IsOk()) { - InferContextCtx* lctx = new InferContextCtx; - if (protocol == ProtocolType::HTTP) { - err = nic::InferHttpContext::Create( - &(lctx->ctx), std::string(url), std::string(model_name), model_version, - verbose); - } else { - err = nic::InferGrpcContext::Create( - &(lctx->ctx), std::string(url), std::string(model_name), model_version, - verbose); - } - - if (err.IsOk()) { - *ctx = lctx; - return nullptr; - } - delete lctx; - } - - *ctx = nullptr; - return new nic::Error(err); -} - -void -InferContextDelete(InferContextCtx* ctx) -{ - delete ctx; -} - -nic::Error* -InferContextSetOptions( - InferContextCtx* ctx, nic::InferContext::Options* options) -{ - nic::Error err = ctx->ctx->SetRunOptions(*options); - return new nic::Error(err); -} - -nic::Error* -InferContextRun(InferContextCtx* ctx) -{ - ctx->results.clear(); - nic::Error err = ctx->ctx->Run(&ctx->results); - return new nic::Error(err); -} - -nic::Error* -InferContextAsyncRun(InferContextCtx* ctx, size_t* request_id) -{ - std::shared_ptr request; - nic::Error err = ctx->ctx->AsyncRun(&request); - ctx->requests.push_back(request); - *request_id = request->Id(); - return new nic::Error(err); -} - -nic::Error* -InferContextGetAsyncRunResults( - InferContextCtx* ctx, size_t request_id, bool wait) -{ - for (auto itr = ctx->requests.begin(); itr != ctx->requests.end(); itr++) { - if ((*itr)->Id() == request_id) { - ctx->results.clear(); - nic::Error err = ctx->ctx->GetAsyncRunResults(&ctx->results, *itr, wait); - if (err.IsOk()) { - ctx->requests.erase(itr); - } - return new nic::Error(err); - } - } - return new nic::Error( - ni::RequestStatusCode::INVALID_ARG, - "The request ID doesn't match any existing asynchrnous requests"); -} - -nic::Error* -InferContextGetReadyAsyncRequest( - InferContextCtx* ctx, size_t* request_id, bool wait) -{ - // Here we assume that all asynchronous request is created by calling - // InferContextAsyncRun(). Thus we don't need to check ctx->requests. 
- std::shared_ptr request; - nic::Error err = ctx->ctx->GetReadyAsyncRequest(&request, wait); - *request_id = request->Id(); - return new nic::Error(err); -} - -//============================================================================== -nic::Error* -InferContextOptionsNew(nic::InferContext::Options** ctx, uint64_t batch_size) -{ - std::unique_ptr uctx; - nic::Error err = nic::InferContext::Options::Create(&uctx); - if (err.IsOk()) { - *ctx = uctx.release(); - (*ctx)->SetBatchSize(batch_size); - return nullptr; - } - - *ctx = nullptr; - return new nic::Error(err); -} - -void -InferContextOptionsDelete(nic::InferContext::Options* ctx) -{ - delete ctx; -} - -nic::Error* -InferContextOptionsAddRaw( - InferContextCtx* infer_ctx, nic::InferContext::Options* ctx, - const char* output_name) -{ - std::shared_ptr output; - nic::Error err = infer_ctx->ctx->GetOutput(std::string(output_name), &output); - if (err.IsOk()) { - err = ctx->AddRawResult(output); - } - - return new nic::Error(err); -} - -nic::Error* -InferContextOptionsAddClass( - InferContextCtx* infer_ctx, nic::InferContext::Options* ctx, - const char* output_name, uint64_t count) -{ - std::shared_ptr output; - nic::Error err = infer_ctx->ctx->GetOutput(std::string(output_name), &output); - if (err.IsOk()) { - err = ctx->AddClassResult(output, count); - } - - return new nic::Error(err); -} - -//============================================================================== -struct InferContextInputCtx { - std::shared_ptr input; -}; - -nic::Error* -InferContextInputNew( - InferContextInputCtx** ctx, InferContextCtx* infer_ctx, - const char* input_name) -{ - InferContextInputCtx* lctx = new InferContextInputCtx; - nic::Error err = - infer_ctx->ctx->GetInput(std::string(input_name), &lctx->input); - if (err.IsOk()) { - lctx->input->Reset(); - } - *ctx = lctx; - return new nic::Error(err); -} - -void -InferContextInputDelete(InferContextInputCtx* ctx) -{ - delete ctx; -} - -nic::Error* -InferContextInputSetRaw( - InferContextInputCtx* ctx, const void* data, uint64_t byte_size) -{ - nic::Error err = - ctx->input->SetRaw(reinterpret_cast(data), byte_size); - return new nic::Error(err); -} - -//============================================================================== -struct InferContextResultCtx { - std::unique_ptr result; - nic::InferContext::Result::ClassResult cr; -}; - -nic::Error* -InferContextResultNew( - InferContextResultCtx** ctx, InferContextCtx* infer_ctx, - const char* result_name) -{ - InferContextResultCtx* lctx = new InferContextResultCtx; - for (auto& r : infer_ctx->results) { - if ((r != nullptr) && (r->GetOutput()->Name() == result_name)) { - lctx->result.swap(r); - break; - } - } - - if (lctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, - "unable to find result for output '" + std::string(result_name) + "'"); - } - - *ctx = lctx; - return nullptr; -} - -void -InferContextResultDelete(InferContextResultCtx* ctx) -{ - delete ctx; -} - -nic::Error* -InferContextResultModelName(InferContextResultCtx* ctx, const char** model_name) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, - "model name not available for empty result"); - } - - *model_name = ctx->result->ModelName().c_str(); - return nullptr; -} - -nic::Error* -InferContextResultModelVersion( - InferContextResultCtx* ctx, uint32_t* model_version) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, - "model version not available for empty 
result"); - } - - *model_version = ctx->result->ModelVersion(); - return nullptr; -} - -nic::Error* -InferContextResultDataType(InferContextResultCtx* ctx, uint32_t* dtype) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, - "datatype not available for empty result"); - } - - ni::DataType data_type = ctx->result->GetOutput()->DType(); - *dtype = static_cast(data_type); - - return nullptr; -} - -nic::Error* -InferContextResultDims( - InferContextResultCtx* ctx, uint64_t max_dims, uint32_t* shape, - uint64_t* shape_len) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, "dims not available for empty result"); - } - - const ni::DimsList& dims = ctx->result->GetOutput()->Dims(); - if (static_cast(dims.size()) > max_dims) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, - "number of result dims exceeds maximum of " + std::to_string(max_dims)); - } - - size_t cnt = 0; - for (auto dim : dims) { - shape[cnt++] = static_cast(dim); - } - - *shape_len = dims.size(); - - return nullptr; -} - -nic::Error* -InferContextResultNextRaw( - InferContextResultCtx* ctx, size_t batch_idx, const char** val, - uint64_t* val_len) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, - "no raw result available for empty result"); - } - - const std::vector* buf; - nic::Error err = ctx->result->GetRaw(batch_idx, &buf); - if (err.IsOk()) { - *val = reinterpret_cast(&((*buf)[0])); - *val_len = buf->size(); - } - - return new nic::Error(err); -} - -nic::Error* -InferContextResultClassCount( - InferContextResultCtx* ctx, size_t batch_idx, uint64_t* count) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, "no classes available for empty result"); - } - - nic::Error err = ctx->result->GetClassCount(batch_idx, count); - return new nic::Error(err); -} - -nic::Error* -InferContextResultNextClass( - InferContextResultCtx* ctx, size_t batch_idx, uint64_t* idx, float* prob, - const char** label) -{ - if (ctx->result == nullptr) { - return new nic::Error( - ni::RequestStatusCode::INTERNAL, "no classes available for empty result"); - } - - nic::Error err = ctx->result->GetClassAtCursor(batch_idx, &ctx->cr); - if (err.IsOk()) { - auto& cr = ctx->cr; - *idx = cr.idx; - *prob = cr.value; - *label = cr.label.c_str(); - } - - return new nic::Error(err); -} diff --git a/src/clients/python/crequest.h b/src/clients/python/crequest.h deleted file mode 100644 index e2d2cc5131..0000000000 --- a/src/clients/python/crequest.h +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// * Neither the name of NVIDIA CORPORATION nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once - -#include -#include "src/clients/c++/request.h" - -namespace nic = nvidia::inferenceserver::client; - -#ifdef __cplusplus -extern "C" { -#endif - - -//============================================================================== -// Error -nic::Error* ErrorNew(const char* msg); -void ErrorDelete(nic::Error* ctx); -bool ErrorIsOk(nic::Error* ctx); -bool ErrorIsUnavailable(nic::Error* ctx); -const char* ErrorMessage(nic::Error* ctx); -const char* ErrorServerId(nic::Error* ctx); -uint64_t ErrorRequestId(nic::Error* ctx); - -//============================================================================== -// ServerHealthContext -typedef struct ServerHealthContextCtx ServerHealthContextCtx; -nic::Error* ServerHealthContextNew( - ServerHealthContextCtx** ctx, const char* url, int protocol_int, - bool verbose); -void ServerHealthContextDelete(ServerHealthContextCtx* ctx); -nic::Error* ServerHealthContextGetReady( - ServerHealthContextCtx* ctx, bool* ready); -nic::Error* ServerHealthContextGetLive(ServerHealthContextCtx* ctx, bool* live); - -//============================================================================== -// ServerStatusContext -typedef struct ServerStatusContextCtx ServerStatusContextCtx; -nic::Error* ServerStatusContextNew( - ServerStatusContextCtx** ctx, const char* url, int protocol_int, - const char* model_name, bool verbose); -void ServerStatusContextDelete(ServerStatusContextCtx* ctx); -nic::Error* ServerStatusContextGetServerStatus( - ServerStatusContextCtx* ctx, char** status, uint32_t* status_len); - -//============================================================================== -// InferContext -typedef struct InferContextCtx InferContextCtx; -nic::Error* InferContextNew( - InferContextCtx** ctx, const char* url, int protocol_int, - const char* model_name, int model_version, bool verbose); -void InferContextDelete(InferContextCtx* ctx); -nic::Error* InferContextSetOptions( - InferContextCtx* ctx, nic::InferContext::Options* options); -nic::Error* InferContextRun(InferContextCtx* ctx); -nic::Error* InferContextAsyncRun(InferContextCtx* ctx, size_t* request_id); -nic::Error* InferContextGetAsyncRunResults( - InferContextCtx* ctx, size_t request_id, bool wait); -nic::Error* InferContextGetReadyAsyncRequest( - InferContextCtx* ctx, size_t* request_id, bool wait); - -//============================================================================== -// InferContext::Options -nic::Error* InferContextOptionsNew( - nic::InferContext::Options** ctx, uint64_t batch_size); -void InferContextOptionsDelete(nic::InferContext::Options* ctx); -nic::Error* InferContextOptionsAddRaw( - InferContextCtx* infer_ctx, nic::InferContext::Options* ctx, - const char* output_name); -nic::Error* 
InferContextOptionsAddClass( - InferContextCtx* infer_ctx, nic::InferContext::Options* ctx, - const char* output_name, uint64_t count); - -//============================================================================== -// InferContext::Input -typedef struct InferContextInputCtx InferContextInputCtx; -nic::Error* InferContextInputNew( - InferContextInputCtx** ctx, InferContextCtx* infer_ctx, - const char* input_name); -void InferContextInputDelete(InferContextInputCtx* ctx); -nic::Error* InferContextInputSetRaw( - InferContextInputCtx* ctx, const void* data, uint64_t byte_size); - -//============================================================================== -// InferContext::Result -typedef struct InferContextResultCtx InferContextResultCtx; -nic::Error* InferContextResultNew( - InferContextResultCtx** ctx, InferContextCtx* infer_ctx, - const char* result_name); -void InferContextResultDelete(InferContextResultCtx* ctx); -nic::Error* InferContextResultModelName( - InferContextResultCtx* ctx, const char** model_name); -nic::Error* InferContextResultModelVersion( - InferContextResultCtx* ctx, uint32_t* model_version); -nic::Error* InferContextResultDataType( - InferContextResultCtx* ctx, uint32_t* dtype); -nic::Error* InferContextResultDims( - InferContextResultCtx* ctx, uint64_t max_dims, uint32_t* shape, - uint64_t* shape_len); -nic::Error* InferContextResultNextRaw( - InferContextResultCtx* ctx, size_t batch_idx, const char** val, - uint64_t* val_len); -nic::Error* InferContextResultClassCount( - InferContextResultCtx* ctx, size_t batch_idx, uint64_t* count); -nic::Error* InferContextResultNextClass( - InferContextResultCtx* ctx, size_t batch_idx, uint64_t* idx, float* prob, - const char** label); - -#ifdef __cplusplus -} -#endif diff --git a/src/clients/python/grpc_image_client.py b/src/clients/python/grpc_image_client.py deleted file mode 100755 index 25862b4c9d..0000000000 --- a/src/clients/python/grpc_image_client.py +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import argparse -import numpy as np -import os -from builtins import range -from PIL import Image - -import grpc -from tensorrtserver.api import api_pb2 -from tensorrtserver.api import grpc_service_pb2 -from tensorrtserver.api import grpc_service_pb2_grpc -import tensorrtserver.api.model_config_pb2 as model_config - -FLAGS = None - -def model_dtype_to_np(model_dtype): - - if model_dtype == model_config.TYPE_BOOL: - return np.bool - elif model_dtype == model_config.TYPE_INT8: - return np.int8 - elif model_dtype == model_config.TYPE_INT16: - return np.int16 - elif model_dtype == model_config.TYPE_INT32: - return np.int32 - elif model_dtype == model_config.TYPE_INT64: - return np.int64 - elif model_dtype == model_config.TYPE_UINT8: - return np.uint8 - elif model_dtype == model_config.TYPE_UINT16: - return np.uint16 - elif model_dtype == model_config.TYPE_FP16: - return np.float16 - elif model_dtype == model_config.TYPE_FP32: - return np.float32 - elif model_dtype == model_config.TYPE_FP64: - return np.float64 - return None - -def parse_model(status, model_name, batch_size, verbose=False): - """ - Check the configuration of a model to make sure it meets the - requirements for an image classification network (as expected by - this client) - """ - server_status = status.server_status - if model_name not in server_status.model_status.keys(): - raise Exception("unable to get status for '" + model_name + "'") - - status = server_status.model_status[model_name] - config = status.config - - if len(config.input) != 1: - raise Exception("expecting 1 input, got {}".format(len(config.input))) - if len(config.output) != 1: - raise Exception("expecting 1 output, got {}".format(len(config.output))) - - input = config.input[0] - output = config.output[0] - - if output.data_type != model_config.TYPE_FP32: - raise Exception("expecting output datatype to be TYPE_FP32, model '" + - model_name + "' output type is " + - model_config.DataType.Name(output.data_type)) - - # Output is expected to be a vector. But allow any number of - # dimensions as long as all but 1 is size 1 (e.g. { 10 }, { 1, 10 - # }, { 10, 1, 1 } are all ok). - non_one_cnt = 0 - for dim in output.dims: - if dim > 1: - non_one_cnt += 1 - if non_one_cnt > 1: - raise Exception("expecting model output to be a vector") - - # Model specifying maximum batch size of 0 indicates that batching - # is not supported and so the input tensors do not expect an "N" - # dimension (and 'batch_size' should be 1 so that only a single - # image instance is inferred at a time). 
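# For example, a config with max_batch_size 0 only accepts batch_size 1
# (its tensors carry no batch dimension), while max_batch_size 8 accepts
# any batch_size from 1 through 8; the checks below enforce exactly that.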
- max_batch_size = config.max_batch_size - if max_batch_size == 0: - if batch_size != 1: - raise Exception("batching not supported for model '" + model_name + "'") - else: # max_batch_size > 0 - if batch_size > max_batch_size: - raise Exception( - "expecting batch size <= {} for model '{}'".format(max_batch_size, model_name)) - - # Model input must have 3 dims, either CHW or HWC - if len(input.dims) != 3: - raise Exception( - "expecting input to have 3 dimensions, model '{}' input has {}".format( - model_name, len(input.dims))) - - if ((input.format != model_config.ModelInput.FORMAT_NCHW) and - (input.format != model_config.ModelInput.FORMAT_NHWC)): - raise Exception("unexpected input format " + model_config.ModelInput.Format.Name(input.format) + - ", expecting " + - model_config.ModelInput.Format.Name(model_config.ModelInput.FORMAT_NCHW) + - " or " + - model_config.ModelInput.Format.Name(model_config.ModelInput.FORMAT_NHWC)) - - if input.format == model_config.ModelInput.FORMAT_NHWC: - h = input.dims[0] - w = input.dims[1] - c = input.dims[2] - else: - c = input.dims[0] - h = input.dims[1] - w = input.dims[2] - - output_size = 1 - for dim in output.dims: - output_size = output_size * dim - output_size = output_size * np.dtype(model_dtype_to_np(output.data_type)).itemsize - - return (input.name, output.name, c, h, w, input.format, model_dtype_to_np(input.data_type), output_size) - -def preprocess(img, format, dtype, c, h, w, scaling): - """ - Pre-process an image to meet the size, type and format - requirements specified by the parameters. - """ - #np.set_printoptions(threshold='nan') - - if c == 1: - sample_img = img.convert('L') - else: - sample_img = img.convert('RGB') - - resized_img = sample_img.resize((h, w), Image.BILINEAR) - resized = np.array(resized_img) - if resized.ndim == 2: - resized = resized[:,:,np.newaxis] - - typed = resized.astype(dtype) - - if scaling == 'INCEPTION': - scaled = (typed / 128) - 1 - elif scaling == 'VGG': - if c == 1: - scaled = typed - np.asarray((128,), dtype=dtype) - else: - scaled = typed - np.asarray((123, 117, 104), dtype=dtype) - else: - scaled = typed - - # Swap to CHW if necessary - if format == model_config.ModelInput.FORMAT_NCHW: - ordered = np.transpose(scaled, (2, 0, 1)) - else: - ordered = scaled - - # Channels are in RGB order. Currently model configuration data - # doesn't provide any information as to other channel orderings - # (like BGR) so we just assume RGB. - return ordered - -def postprocess(results, filenames, batch_size): - """ - Post-process results to show classifications. 
- """ - if len(results) != 1: - raise Exception("expected 1 result, got {}".format(len(results))) - - batched_result = results[0].batch_classes - if len(batched_result) != batch_size: - raise Exception("expected {} results, got {}".format(batch_size, len(batched_result))) - if len(filenames) != batch_size: - raise Exception("expected {} filenames, got {}".format(batch_size, len(filenames))) - - for (index, result) in enumerate(batched_result): - print("Image '{}':".format(filenames[index])) - for cls in result.cls: - print(" {} ({}) = {}".format(cls.idx, cls.label, cls.value)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False, - help='Enable verbose output') - parser.add_argument('-a', '--async', action="store_true", required=False, default=False, - help='Use asynchronous inference API') - parser.add_argument('-m', '--model-name', type=str, required=True, - help='Name of model') - parser.add_argument('-x', '--model-version', type=int, required=False, - help='Version of model. Default is to use latest version.') - parser.add_argument('-b', '--batch-size', type=int, required=False, default=1, - help='Batch size. Default is 1.') - parser.add_argument('-c', '--classes', type=int, required=False, default=1, - help='Number of class results to report. Default is 1.') - parser.add_argument('-s', '--scaling', type=str, choices=['NONE', 'INCEPTION', 'VGG'], - required=False, default='NONE', - help='Type of scaling to apply to image pixels. Default is NONE.') - parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8001', - help='Inference server URL. Default is localhost:8001.') - parser.add_argument('image_filename', type=str, nargs='?', default=None, - help='Input image.') - FLAGS = parser.parse_args() - - # Create gRPC stub for communicating with the server - channel = grpc.insecure_channel(FLAGS.url) - grpc_stub = grpc_service_pb2_grpc.GRPCServiceStub(channel) - - # Prepare request for Status gRPC - request = grpc_service_pb2.StatusRequest(model_name=FLAGS.model_name) - # Call and receive response from Status gRPC - response = grpc_stub.Status(request) - # Make sure the model matches our requirements, and get some - # properties of the model that we need for preprocessing - input_name, output_name, c, h, w, format, dtype, output_size = parse_model( - response, FLAGS.model_name, FLAGS.batch_size, FLAGS.verbose) - - # Prepare request for Infer gRPC - # The meta data part can be reused across requests - request = grpc_service_pb2.InferRequest() - request.model_name = FLAGS.model_name - if FLAGS.model_version is None: - request.version = -1 - else: - request.version = FLAGS.model_version - request.meta_data.batch_size = FLAGS.batch_size - output_message = api_pb2.InferRequestHeader.Output() - output_message.name = output_name - output_message.byte_size = output_size - output_message.cls.count = FLAGS.classes - request.meta_data.output.extend([output_message]) - - filenames = [] - if os.path.isdir(FLAGS.image_filename): - filenames = [os.path.join(FLAGS.image_filename, f) - for f in os.listdir(FLAGS.image_filename) - if os.path.isfile(os.path.join(FLAGS.image_filename, f))] - else: - filenames = [FLAGS.image_filename,] - - filenames.sort() - - # Preprocess the images into input data according to model - # requirements - image_data = [] - for filename in filenames: - img = Image.open(filename) - image_data.append(preprocess(img, format, dtype, c, h, w, FLAGS.scaling)) 
- - request.meta_data.input.add( - name=input_name, byte_size=image_data[0].size * image_data[0].itemsize) - - # Send requests of FLAGS.batch_size images. If the number of - # images isn't an exact multiple of FLAGS.batch_size then just - # start over with the first images until the batch is filled. - result_filenames = [] - requests = [] - responses = [] - image_idx = 0 - last_request = False - while not last_request: - input_bytes = None - input_filenames = [] - del request.raw_input[:] - for idx in range(FLAGS.batch_size): - input_filenames.append(filenames[image_idx]) - if input_bytes is None: - input_bytes = image_data[image_idx].tobytes() - else: - input_bytes += image_data[image_idx].tobytes() - - image_idx = (image_idx + 1) % len(image_data) - if image_idx == 0: - last_request = True - - request.raw_input.extend([input_bytes]) - result_filenames.append(input_filenames) - - # Send request - if not FLAGS.async: - responses.append(grpc_stub.Infer(request)) - else: - requests.append(grpc_stub.Infer.future(request)) - - # For async, retrieve results according to the send order - if FLAGS.async: - for request in requests: - responses.append(request.result()) - - for idx in range(len(responses)): - print("Request {}, batch size {}".format(idx, FLAGS.batch_size)) - postprocess(responses[idx].meta_data.output, result_filenames[idx], FLAGS.batch_size) diff --git a/src/clients/python/image_client.py b/src/clients/python/image_client.py deleted file mode 100755 index c3254d5036..0000000000 --- a/src/clients/python/image_client.py +++ /dev/null @@ -1,289 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import argparse -import numpy as np -import os -from builtins import range -from PIL import Image -from tensorrtserver.api import * -import tensorrtserver.api.model_config_pb2 as model_config - -FLAGS = None - -def model_dtype_to_np(model_dtype): - if model_dtype == model_config.TYPE_BOOL: - return np.bool - elif model_dtype == model_config.TYPE_INT8: - return np.int8 - elif model_dtype == model_config.TYPE_INT16: - return np.int16 - elif model_dtype == model_config.TYPE_INT32: - return np.int32 - elif model_dtype == model_config.TYPE_INT64: - return np.int64 - elif model_dtype == model_config.TYPE_UINT8: - return np.uint8 - elif model_dtype == model_config.TYPE_UINT16: - return np.uint16 - elif model_dtype == model_config.TYPE_FP16: - return np.float16 - elif model_dtype == model_config.TYPE_FP32: - return np.float32 - elif model_dtype == model_config.TYPE_FP64: - return np.float64 - return None - -def parse_model(url, protocol, model_name, batch_size, verbose=False): - """ - Check the configuration of a model to make sure it meets the - requirements for an image classification network (as expected by - this client) - """ - ctx = ServerStatusContext(url, protocol, model_name, verbose) - server_status = ctx.get_server_status() - - if model_name not in server_status.model_status: - raise Exception("unable to get status for '" + model_name + "'") - - status = server_status.model_status[model_name] - config = status.config - - if len(config.input) != 1: - raise Exception("expecting 1 input, got {}".format(len(config.input))) - if len(config.output) != 1: - raise Exception("expecting 1 output, got {}".format(len(config.output))) - - input = config.input[0] - output = config.output[0] - - if output.data_type != model_config.TYPE_FP32: - raise Exception("expecting output datatype to be TYPE_FP32, model '" + - model_name + "' output type is " + - model_config.DataType.Name(output.data_type)) - - # Output is expected to be a vector. But allow any number of - # dimensions as long as all but 1 is size 1 (e.g. { 10 }, { 1, 10 - # }, { 10, 1, 1 } are all ok). - non_one_cnt = 0 - for dim in output.dims: - if dim > 1: - non_one_cnt += 1 - if non_one_cnt > 1: - raise Exception("expecting model output to be a vector") - - # Model specifying maximum batch size of 0 indicates that batching - # is not supported and so the input tensors do not expect an "N" - # dimension (and 'batch_size' should be 1 so that only a single - # image instance is inferred at a time). 
- max_batch_size = config.max_batch_size - if max_batch_size == 0: - if batch_size != 1: - raise Exception("batching not supported for model '" + model_name + "'") - else: # max_batch_size > 0 - if batch_size > max_batch_size: - raise Exception("expecting batch size <= {} for model {}".format(max_batch_size, model_name)) - - # Model input must have 3 dims, either CHW or HWC - if len(input.dims) != 3: - raise Exception( - "expecting input to have 3 dimensions, model '{}' input has {}".format( - model_name, len(input.dims))) - - if ((input.format != model_config.ModelInput.FORMAT_NCHW) and - (input.format != model_config.ModelInput.FORMAT_NHWC)): - raise Exception("unexpected input format " + model_config.ModelInput.Format.Name(input.format) + - ", expecting " + - model_config.ModelInput.Format.Name(model_config.ModelInput.FORMAT_NCHW) + - " or " + - model_config.ModelInput.Format.Name(model_config.ModelInput.FORMAT_NHWC)) - - if input.format == model_config.ModelInput.FORMAT_NHWC: - h = input.dims[0] - w = input.dims[1] - c = input.dims[2] - else: - c = input.dims[0] - h = input.dims[1] - w = input.dims[2] - - return (input.name, output.name, c, h, w, input.format, model_dtype_to_np(input.data_type)) - -def preprocess(img, format, dtype, c, h, w, scaling): - """ - Pre-process an image to meet the size, type and format - requirements specified by the parameters. - """ - #np.set_printoptions(threshold='nan') - - if c == 1: - sample_img = img.convert('L') - else: - sample_img = img.convert('RGB') - - resized_img = sample_img.resize((h, w), Image.BILINEAR) - resized = np.array(resized_img) - if resized.ndim == 2: - resized = resized[:,:,np.newaxis] - - typed = resized.astype(dtype) - - if scaling == 'INCEPTION': - scaled = (typed / 128) - 1 - elif scaling == 'VGG': - if c == 1: - scaled = typed - np.asarray((128,), dtype=dtype) - else: - scaled = typed - np.asarray((123, 117, 104), dtype=dtype) - else: - scaled = typed - - # Swap to CHW if necessary - if format == model_config.ModelInput.FORMAT_NCHW: - ordered = np.transpose(scaled, (2, 0, 1)) - else: - ordered = scaled - - # Channels are in RGB order. Currently model configuration data - # doesn't provide any information as to other channel orderings - # (like BGR) so we just assume RGB. - return ordered - -def postprocess(results, filenames, batch_size): - """ - Post-process results to show classifications. - """ - if len(results) != 1: - raise Exception("expected 1 result, got {}".format(len(results))) - - batched_result = list(results.values())[0] - if len(batched_result) != batch_size: - raise Exception("expected {} results, got {}".format(batch_size, len(batched_result))) - if len(filenames) != batch_size: - raise Exception("expected {} filenames, got {}".format(batch_size, len(filenames))) - - for (index, result) in enumerate(batched_result): - print("Image '{}':".format(filenames[index])) - for cls in result: - print(" {} ({}) = {}".format(cls[0], cls[2], cls[1])) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False, - help='Enable verbose output') - parser.add_argument('-a', '--async', action="store_true", required=False, default=False, - help='Use asynchronous inference API') - parser.add_argument('-m', '--model-name', type=str, required=True, - help='Name of model') - parser.add_argument('-x', '--model-version', type=int, required=False, - help='Version of model. 
Default is to use latest version.') - parser.add_argument('-b', '--batch-size', type=int, required=False, default=1, - help='Batch size. Default is 1.') - parser.add_argument('-c', '--classes', type=int, required=False, default=1, - help='Number of class results to report. Default is 1.') - parser.add_argument('-s', '--scaling', type=str, choices=['NONE', 'INCEPTION', 'VGG'], - required=False, default='NONE', - help='Type of scaling to apply to image pixels. Default is NONE.') - parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') - parser.add_argument('-i', '--protocol', type=str, required=False, default='HTTP', - help='Protocol (HTTP/gRPC) used to ' + - 'communicate with inference service. Default is HTTP.') - parser.add_argument('image_filename', type=str, nargs='?', default=None, - help='Input image / Input folder.') - FLAGS = parser.parse_args() - - protocol = ProtocolType.from_str(FLAGS.protocol) - - # Make sure the model matches our requirements, and get some - # properties of the model that we need for preprocessing - input_name, output_name, c, h, w, format, dtype = parse_model( - FLAGS.url, protocol, FLAGS.model_name, - FLAGS.batch_size, FLAGS.verbose) - - ctx = InferContext(FLAGS.url, protocol, - FLAGS.model_name, FLAGS.model_version, FLAGS.verbose) - - filenames = [] - if os.path.isdir(FLAGS.image_filename): - filenames = [os.path.join(FLAGS.image_filename, f) - for f in os.listdir(FLAGS.image_filename) - if os.path.isfile(os.path.join(FLAGS.image_filename, f))] - else: - filenames = [FLAGS.image_filename,] - - filenames.sort() - - # Preprocess the images into input data according to model - # requirements - image_data = [] - for filename in filenames: - img = Image.open(filename) - image_data.append(preprocess(img, format, dtype, c, h, w, FLAGS.scaling)) - - # Send requests of FLAGS.batch_size images. If the number of - # images isn't an exact multiple of FLAGS.batch_size then just - # start over with the first images until the batch is filled. - results = [] - result_filenames = [] - request_ids = [] - image_idx = 0 - last_request = False - while not last_request: - input_filenames = [] - input_batch = [] - for idx in range(FLAGS.batch_size): - input_filenames.append(filenames[image_idx]) - input_batch.append(image_data[image_idx]) - image_idx = (image_idx + 1) % len(image_data) - if image_idx == 0: - last_request = True - - result_filenames.append(input_filenames) - - # Send request - if not FLAGS.async: - results.append(ctx.run( - { input_name : input_batch }, - { output_name : (InferContext.ResultFormat.CLASS, FLAGS.classes) }, - FLAGS.batch_size)) - else: - request_ids.append(ctx.async_run( - { input_name : input_batch }, - { output_name : (InferContext.ResultFormat.CLASS, FLAGS.classes) }, - FLAGS.batch_size)) - - # For async, retrieve results according to the send order - if FLAGS.async: - for request_id in request_ids: - results.append(ctx.get_async_run_results(request_id, True)) - - for idx in range(len(results)): - print("Request {}, batch size {}".format(idx, FLAGS.batch_size)) - postprocess(results[idx], result_filenames[idx], FLAGS.batch_size) diff --git a/src/clients/python/setup.py b/src/clients/python/setup.py deleted file mode 100644 index 7a105561b5..0000000000 --- a/src/clients/python/setup.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -from setuptools import find_packages -from setuptools import setup, dist - -if 'VERSION' not in os.environ: - raise Exception('envvar VERSION must be specified') - -VERSION = os.environ['VERSION'] - -REQUIRED = [ - 'future', - 'numpy', - 'protobuf>=3.5.0', - 'grpcio' -] - -try: - from wheel.bdist_wheel import bdist_wheel as _bdist_wheel - class bdist_wheel(_bdist_wheel): - def finalize_options(self): - _bdist_wheel.finalize_options(self) - self.root_is_pure = False -except ImportError: - bdist_wheel = None - -setup( - name='tensorrtserver', - version=VERSION, - author='NVIDIA Inc.', - author_email='davidg@nvidia.com', - description='Python client library for TensorRT Inference Server', - license='BSD', - url='http://nvidia.com', - keywords='tensorrt inference server service client', - packages=find_packages(), - install_requires=REQUIRED, - package_data={ - '': [ 'libcrequest.so', ], - }, - zip_safe=False, - cmdclass={'bdist_wheel': bdist_wheel}, -) diff --git a/src/clients/python/simple_client.py b/src/clients/python/simple_client.py deleted file mode 100644 index cdfdbfede1..0000000000 --- a/src/clients/python/simple_client.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/python - -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse -import numpy as np -import os -from builtins import range -from tensorrtserver.api import * - -FLAGS = None - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False, - help='Enable verbose output') - parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') - parser.add_argument('-i', '--protocol', type=str, required=False, default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - - FLAGS = parser.parse_args() - protocol = ProtocolType.from_str(FLAGS.protocol) - - # We use a simple model that takes 2 input tensors of 16 integers - # each and returns 2 output tensors of 16 integers each. One - # output tensor is the element-wise sum of the inputs and one - # output is the element-wise difference. - model_name = "simple" - model_version = -1 - batch_size = 1 - - # Create the inference context for the model. - ctx = InferContext(FLAGS.url, protocol, model_name, model_version, FLAGS.verbose) - - # Create the data for the two input tensors. Initialize the first - # to unique integers and the second to all ones. - input0_data = np.arange(start=0, stop=16, dtype=np.int32) - input1_data = np.ones(shape=16, dtype=np.int32) - - # Send inference request to the inference server. Get results for - # both output tensors. - result = ctx.run({ 'INPUT0' : (input0_data,), - 'INPUT1' : (input1_data,) }, - { 'OUTPUT0' : InferContext.ResultFormat.RAW, - 'OUTPUT1' : InferContext.ResultFormat.RAW }, - batch_size) - - # We expect there to be 2 results (each with batch-size 1). Walk - # over all 16 result elements and print the sum and difference - # calculated by the model. - output0_data = result['OUTPUT0'][0] - output1_data = result['OUTPUT1'][0] - - for i in range(16): - print(str(input0_data[i]) + " + " + str(input1_data[i]) + " = " + str(output0_data[i])) - print(str(input0_data[i]) + " - " + str(input1_data[i]) + " = " + str(output1_data[i])) - if (input0_data[i] + input1_data[i]) != output0_data[i]: - print("error: incorrect sum"); - sys.exit(1); - if (input0_data[i] - input1_data[i]) != output1_data[i]: - print("error: incorrect difference"); - sys.exit(1); diff --git a/src/command_line_parser.cc b/src/command_line_parser.cc new file mode 100644 index 0000000000..53a103d33b --- /dev/null +++ b/src/command_line_parser.cc @@ -0,0 +1,2386 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include "command_line_parser.h" +constexpr const char* GLOBAL_OPTION_GROUP = ""; + +#ifdef _WIN32 +int optind = 1; +const char* optarg = nullptr; + +/// Implementation of `getopt_long` for Windows. +/// Linux uses available implementation: +/// https://github.com/gcc-mirror/gcc/blob/fab08d12b40ad637c5a4ce8e026fb43cd3f0fad1/include/getopt.h +/// and +/// https://github.com/gcc-mirror/gcc/blob/fab08d12b40ad637c5a4ce8e026fb43cd3f0fad1/libiberty/getopt.c#L521 +/// Parameters' description is available here: +/// https://github.com/gcc-mirror/gcc/blob/fab08d12b40ad637c5a4ce8e026fb43cd3f0fad1/libiberty/getopt.c#L464-L518 +/// `optind' is an index to iterate over `argv`, (whose length is `argc`), +/// and starts from 1, since argv[0] is the program name. +/// Text in the current `argv`-element is returned in `optarg'. +/// Note: if option was provided in the form of --=, then +/// optarg is (argv[optind] + found + 1), i.e. everything after `=`. +/// Alternatively, option can be provided as -- . +/// In this case, is storred as a separate parameter in `argv`. +/// `longind` returns the index in `longopts` of the long-named option found. + +int +getopt_long( + int argc, char* const argv[], const char* optstring, + const struct option* longopts, int* longind) +{ + if (optind >= argc) { + return -1; + } + const struct option* curr_longopt = longopts; + std::string argv_str = argv[optind]; + size_t found = argv_str.find_first_of("="); + std::string key = argv_str.substr( + 2, (found == std::string::npos) ? 
std::string::npos : (found - 2)); + int option_index = 0; + for (curr_longopt, option_index; curr_longopt->name; + curr_longopt++, option_index++) { + if (key == curr_longopt->name) { + if (longind != NULL) + (*longind) = option_index; + if (curr_longopt->has_arg == required_argument) { + if (found == std::string::npos) { + optind++; + if (optind >= argc) { + std::cerr << argv[0] << ": option '" << argv_str + << "' requires an argument" << std::endl; + return '?'; + } + optarg = argv[optind]; + } else { + optarg = (argv[optind] + found + 1); + } + } + optind++; + return curr_longopt->val; + } + } + return -1; +} +#endif + +#include +#include +#include +#include + +#include "common.h" + +#define TRITONJSON_STATUSTYPE TRITONSERVER_Error* +#define TRITONJSON_STATUSRETURN(M) \ + return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str()) +#define TRITONJSON_STATUSSUCCESS nullptr +#include "triton/common/triton_json.h" + + +namespace triton { namespace server { + +// [FIXME] expose following parse helpers for other type of parser +namespace { + +// A wrapper around std::stoi, std::stoull, std::stoll, std::stod +// to catch `invalid argument` and `out of range` exceptions +template +T StringTo(const std::string& arg); + +template <> +int +StringTo(const std::string& arg) +{ + return std::stoi(arg); +} + +#ifdef TRITON_ENABLE_TRACING +template <> +uint32_t +StringTo(const std::string& arg) +{ + return std::stoul(arg); +} +#endif // TRITON_ENABLE_TRACING + +template <> +uint64_t +StringTo(const std::string& arg) +{ + return std::stoull(arg); +} + +template <> +int64_t +StringTo(const std::string& arg) +{ + return std::stoll(arg); +} + +template <> +double +StringTo(const std::string& arg) +{ + return std::stod(arg); +} + +// There must be specialization for the types to be parsed into so that +// the argument is properly validated and parsed. Attempted to use input +// operator (>>) but it will consume improper argument without error +// (i.e. parse "1.4" to 'int' will return 1 but we want to report error). +template +T +ParseOption(const std::string& arg) +{ + try { + return StringTo(arg); + } + catch (const std::invalid_argument& ia) { + std::stringstream ss; + ss << "Invalid option value. Got " << arg << std::endl; + throw ParseException(ss.str()); + } + catch (const std::out_of_range& oor) { + std::stringstream ss; + ss << "Provided option value is out of bound. Got " << arg << std::endl; + throw ParseException(ss.str()); + } +} + +template <> +bool +ParseOption(const std::string& arg) +{ + // 'arg' need to comply with template declaration + std::string larg = arg; + std::transform(larg.begin(), larg.end(), larg.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if ((larg == "true") || (larg == "on") || (larg == "1")) { + return true; + } + if ((larg == "false") || (larg == "off") || (larg == "0")) { + return false; + } + + throw ParseException("invalid value for bool option: " + arg); +} + +// Condition here merely to avoid compilation error, this function will +// be defined but not used otherwise. 
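
The parse helpers above (and the pair/list splitters that follow just below) all rely on the same pattern: convert the raw option string with a `std::sto*` call, translate `std::invalid_argument` / `std::out_of_range` into a single user-facing parse error, and split compound values such as `key=value` or comma-separated lists on a delimiter. The block below is a minimal, self-contained sketch of that pattern; the helper names (`ParseIntValue`, `SplitPair`, `SplitList`) and the sample flag values in `main()` are illustrative stand-ins, not the signatures or flags used by this file.

```cpp
// Illustrative sketch only -- not the file's exact helpers. It mirrors the
// pattern described above: std::sto* conversions wrapped so that
// std::invalid_argument / std::out_of_range surface as one parse error,
// plus delimiter-based splitting for "<key>=<value>" style option values.
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

struct OptionError : std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Parse an integer option value, reporting junk or out-of-range input.
int ParseIntValue(const std::string& arg)
{
  try {
    return std::stoi(arg);
  }
  catch (const std::invalid_argument&) {
    throw OptionError("invalid option value: " + arg);
  }
  catch (const std::out_of_range&) {
    throw OptionError("option value out of bound: " + arg);
  }
}

// Split "key=value" into its two halves (first '=' wins).
std::pair<std::string, std::string> SplitPair(const std::string& arg)
{
  const size_t pos = arg.find('=');
  if (pos == std::string::npos) {
    throw OptionError("expected <key>=<value>, got: " + arg);
  }
  return {arg.substr(0, pos), arg.substr(pos + 1)};
}

// Split a comma-separated list such as "a,b,c" into its elements.
std::vector<std::string> SplitList(std::string options)
{
  std::vector<std::string> out;
  size_t pos;
  while ((pos = options.find(',')) != std::string::npos) {
    out.push_back(options.substr(0, pos));
    options = options.substr(pos + 1);
  }
  out.push_back(options);  // last (or only) element
  return out;
}

int main()
{
  // Hypothetical sample values, chosen only to exercise the helpers.
  const auto kv = SplitPair("default-max-batch-size=4");
  std::cout << kv.first << " -> " << ParseIntValue(kv.second) << "\n";
  for (const auto& item : SplitList("alpha,beta,gamma")) {
    std::cout << item << "\n";
  }
  return 0;
}
```

Translating the standard-library exceptions into one error type is what lets the caller report a uniform "invalid option value" message regardless of which numeric type a given flag expects.
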
+#ifdef TRITON_ENABLE_LOGGING +int +ParseIntBoolOption(std::string arg) +{ + std::transform(arg.begin(), arg.end(), arg.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (arg == "true") { + return 1; + } + if (arg == "false") { + return 0; + } + + return ParseOption(arg); +} +#endif // TRITON_ENABLE_LOGGING + +std::string +PairsToJsonStr(std::vector> settings) +{ + triton::common::TritonJson::Value json( + triton::common::TritonJson::ValueType::OBJECT); + for (const auto& setting : settings) { + const auto& key = setting.first; + const auto& value = setting.second; + json.SetStringObject(key.c_str(), value); + } + triton::common::TritonJson::WriteBuffer buffer; + auto err = json.Write(&buffer); + if (err != nullptr) { + LOG_TRITONSERVER_ERROR(err, "failed to convert config to JSON"); + } + return buffer.Contents(); +} + +template +std::pair +ParsePairOption(const std::string& arg, const std::string& delim_str) +{ + int delim = arg.find(delim_str); + + if ((delim < 0)) { + std::stringstream ss; + ss << "Cannot parse pair option due to incorrect number of inputs." + "-- argument requires format " + << delim_str << ". " + << "Found: " << arg << std::endl; + throw ParseException(ss.str()); + } + + std::string first_string = arg.substr(0, delim); + std::string second_string = arg.substr(delim + delim_str.length()); + + // Specific conversion from key-value string to actual key-value type, + // should be extracted out of this function if we need to parse + // more pair option of different types. + return {ParseOption(first_string), ParseOption(second_string)}; +} + +// Split 'options' by 'delim_str' and place split strings into a vector +std::vector +SplitOptions(std::string options, const std::string& delim_str) +{ + std::vector res; + + int delim = options.find(delim_str); + while ((delim >= 0)) { + res.emplace_back(options.substr(0, delim)); + options = options.substr(delim + delim_str.length()); + delim = options.find(delim_str); + } + // include last element + res.emplace_back(options); + return res; +} + +} // namespace + +enum TritonOptionId { + OPTION_HELP = 1000, +#ifdef TRITON_ENABLE_LOGGING + OPTION_LOG_VERBOSE, + OPTION_LOG_INFO, + OPTION_LOG_WARNING, + OPTION_LOG_ERROR, + OPTION_LOG_FORMAT, + OPTION_LOG_FILE, +#endif // TRITON_ENABLE_LOGGING + OPTION_ID, + OPTION_MODEL_REPOSITORY, + OPTION_EXIT_ON_ERROR, + OPTION_DISABLE_AUTO_COMPLETE_CONFIG, + OPTION_STRICT_MODEL_CONFIG, + OPTION_STRICT_READINESS, +#if defined(TRITON_ENABLE_HTTP) + OPTION_ALLOW_HTTP, + OPTION_HTTP_HEADER_FORWARD_PATTERN, + OPTION_HTTP_PORT, + OPTION_REUSE_HTTP_PORT, + OPTION_HTTP_ADDRESS, + OPTION_HTTP_THREAD_COUNT, + OPTION_HTTP_RESTRICTED_API, +#endif // TRITON_ENABLE_HTTP +#if defined(TRITON_ENABLE_GRPC) + OPTION_ALLOW_GRPC, + OPTION_GRPC_PORT, + OPTION_REUSE_GRPC_PORT, + OPTION_GRPC_ADDRESS, + OPTION_GRPC_HEADER_FORWARD_PATTERN, + OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE, + OPTION_GRPC_USE_SSL, + OPTION_GRPC_USE_SSL_MUTUAL, + OPTION_GRPC_SERVER_CERT, + OPTION_GRPC_SERVER_KEY, + OPTION_GRPC_ROOT_CERT, + OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL, + OPTION_GRPC_ARG_KEEPALIVE_TIME_MS, + OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS, + OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, + OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, + OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS, + OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES, + OPTION_GRPC_RESTRICTED_PROTOCOL, + OPTION_GRPC_ARG_MAX_CONNECTION_AGE_MS, + OPTION_GRPC_ARG_MAX_CONNECTION_AGE_GRACE_MS, +#endif // TRITON_ENABLE_GRPC +#if 
defined(TRITON_ENABLE_SAGEMAKER) + OPTION_ALLOW_SAGEMAKER, + OPTION_SAGEMAKER_PORT, + OPTION_SAGEMAKER_SAFE_PORT_RANGE, + OPTION_SAGEMAKER_THREAD_COUNT, +#endif // TRITON_ENABLE_SAGEMAKER +#if defined(TRITON_ENABLE_VERTEX_AI) + OPTION_ALLOW_VERTEX_AI, + OPTION_VERTEX_AI_PORT, + OPTION_VERTEX_AI_THREAD_COUNT, + OPTION_VERTEX_AI_DEFAULT_MODEL, +#endif // TRITON_ENABLE_VERTEX_AI +#ifdef TRITON_ENABLE_METRICS + OPTION_ALLOW_METRICS, + OPTION_ALLOW_GPU_METRICS, + OPTION_ALLOW_CPU_METRICS, + OPTION_METRICS_ADDRESS, + OPTION_METRICS_PORT, + OPTION_METRICS_INTERVAL_MS, + OPTION_METRICS_CONFIG, +#endif // TRITON_ENABLE_METRICS +#ifdef TRITON_ENABLE_TRACING + OPTION_TRACE_FILEPATH, + OPTION_TRACE_LEVEL, + OPTION_TRACE_RATE, + OPTION_TRACE_COUNT, + OPTION_TRACE_LOG_FREQUENCY, + OPTION_TRACE_CONFIG, +#endif // TRITON_ENABLE_TRACING + OPTION_MODEL_CONTROL_MODE, + OPTION_POLL_REPO_SECS, + OPTION_STARTUP_MODEL, + OPTION_CUSTOM_MODEL_CONFIG_NAME, + OPTION_RATE_LIMIT, + OPTION_RATE_LIMIT_RESOURCE, + OPTION_PINNED_MEMORY_POOL_BYTE_SIZE, + OPTION_CUDA_MEMORY_POOL_BYTE_SIZE, + OPTION_CUDA_VIRTUAL_ADDRESS_SIZE, + OPTION_RESPONSE_CACHE_BYTE_SIZE, + OPTION_CACHE_CONFIG, + OPTION_CACHE_DIR, + OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY, + OPTION_EXIT_TIMEOUT_SECS, + OPTION_BACKEND_DIR, + OPTION_REPOAGENT_DIR, + OPTION_BUFFER_MANAGER_THREAD_COUNT, + OPTION_MODEL_LOAD_THREAD_COUNT, + OPTION_MODEL_LOAD_RETRY_COUNT, + OPTION_BACKEND_CONFIG, + OPTION_HOST_POLICY, + OPTION_MODEL_LOAD_GPU_LIMIT, + OPTION_MODEL_NAMESPACING, + OPTION_ENABLE_PEER_ACCESS +}; + +void +TritonParser::SetupOptions() +{ + global_options_.push_back( + {OPTION_HELP, "help", Option::ArgNone, "Print usage"}); + + server_options_.push_back( + {OPTION_ID, "id", Option::ArgStr, "Identifier for this server."}); + server_options_.push_back( + {OPTION_EXIT_TIMEOUT_SECS, "exit-timeout-secs", Option::ArgInt, + "Timeout (in seconds) when exiting to wait for in-flight inferences to " + "finish. After the timeout expires the server exits even if inferences " + "are still in flight."}); + + model_repo_options_.push_back( + {OPTION_MODEL_REPOSITORY, "model-store", Option::ArgStr, + "Equivalent to --model-repository."}); + model_repo_options_.push_back( + {OPTION_MODEL_REPOSITORY, "model-repository", Option::ArgStr, + "Path to model repository directory. It may be specified multiple times " + "to add multiple model repositories. Note that if a model is not unique " + "across all model repositories at any time, the model will not be " + "available."}); + model_repo_options_.push_back( + {OPTION_EXIT_ON_ERROR, "exit-on-error", Option::ArgBool, + "Exit the inference server if an error occurs during initialization."}); + model_repo_options_.push_back( + {OPTION_DISABLE_AUTO_COMPLETE_CONFIG, "disable-auto-complete-config", + Option::ArgNone, + "If set, disables the triton and backends from auto completing model " + "configuration files. Model configuration files must be provided and " + "all required " + "configuration settings must be specified."}); + model_repo_options_.push_back( + {OPTION_STRICT_READINESS, "strict-readiness", Option::ArgBool, + "If true /v2/health/ready endpoint indicates ready if the server " + "is responsive and all models are available. If false " + "/v2/health/ready endpoint indicates ready if server is responsive " + "even if some/all models are unavailable."}); + model_repo_options_.push_back( + {OPTION_MODEL_CONTROL_MODE, "model-control-mode", Option::ArgStr, + "Specify the mode for model management. 
Options are \"none\", \"poll\" " + "and \"explicit\". The default is \"none\". " + "For \"none\", the server will load all models in the model " + "repository(s) at startup and will not make any changes to the load " + "models after that. For \"poll\", the server will poll the model " + "repository(s) to detect changes and will load/unload models based on " + "those changes. The poll rate is controlled by 'repository-poll-secs'. " + "For \"explicit\", model load and unload is initiated by using the " + "model control APIs, and only models specified with --load-model will " + "be loaded at startup."}); + model_repo_options_.push_back( + {OPTION_POLL_REPO_SECS, "repository-poll-secs", Option::ArgInt, + "Interval in seconds between each poll of the model repository to check " + "for changes. Valid only when --model-control-mode=poll is " + "specified."}); + model_repo_options_.push_back( + {OPTION_STARTUP_MODEL, "load-model", Option::ArgStr, + "Name of the model to be loaded on server startup. It may be specified " + "multiple times to add multiple models. To load ALL models at startup, " + "specify '*' as the model name with --load-model=* as the ONLY " + "--load-model argument, this does not imply any pattern matching. " + "Specifying --load-model=* in conjunction with another --load-model " + "argument will result in error. Note that this option will only take " + "effect if --model-control-mode=explicit is true."}); + model_repo_options_.push_back( + {OPTION_CUSTOM_MODEL_CONFIG_NAME, "model-config-name", Option::ArgStr, + "The custom configuration name for models to load." + "The name should not contain any space character." + "For example: --model-config-name=h100. " + "If --model-config-name is not set, Triton will use the default " + "config.pbtxt."}); + model_repo_options_.push_back( + {OPTION_MODEL_LOAD_THREAD_COUNT, "model-load-thread-count", + Option::ArgInt, + "The number of threads used to concurrently load models in " + "model repositories. Default is 4."}); + model_repo_options_.push_back( + {OPTION_MODEL_LOAD_RETRY_COUNT, "model-load-retry-count", Option::ArgInt, + "The number of retry to load a model in " + "model repositories. Default is 0."}); + model_repo_options_.push_back( + {OPTION_MODEL_NAMESPACING, "model-namespacing", Option::ArgBool, + "Whether model namespacing is enable or not. If true, models with the " + "same name can be served if they are in different namespace."}); + model_repo_options_.push_back( + {OPTION_ENABLE_PEER_ACCESS, "enable-peer-access", Option::ArgBool, + "Whether the server tries to enable peer access or not. Even when this " + "options is set to true, " + "peer access could still be not enabled because the underlying system " + "doesn't support it." + " The server will log a warning in this case. Default is true."}); + +#if defined(TRITON_ENABLE_HTTP) + http_options_.push_back( + {OPTION_ALLOW_HTTP, "allow-http", Option::ArgBool, + "Allow the server to listen for HTTP requests."}); + http_options_.push_back( + {OPTION_HTTP_ADDRESS, "http-address", Option::ArgStr, + "The address for the http server to bind to. Default is 0.0.0.0"}); + http_options_.push_back( + {OPTION_HTTP_PORT, "http-port", Option::ArgInt, + "The port for the server to listen on for HTTP " + "requests. Default is 8000."}); + http_options_.push_back( + {OPTION_REUSE_HTTP_PORT, "reuse-http-port", Option::ArgBool, + "Allow multiple servers to listen on the same HTTP port when every " + "server has this option set. 
If you plan to use this option as a way to " + "load balance between different Triton servers, the same model " + "repository or set of models must be used for every server."}); + http_options_.push_back( + {OPTION_HTTP_HEADER_FORWARD_PATTERN, "http-header-forward-pattern", + Option::ArgStr, + "The regular expression pattern that will be used for forwarding HTTP " + "headers as inference request parameters."}); + http_options_.push_back( + {OPTION_HTTP_THREAD_COUNT, "http-thread-count", Option::ArgInt, + "Number of threads handling HTTP requests."}); + http_options_.push_back( + {OPTION_HTTP_RESTRICTED_API, "http-restricted-api", + ":=", + "Specify restricted HTTP api setting. The format of this " + "flag is --http-restricted-api=,=. Where " + " is a comma-separated list of apis to be restricted. " + " will be additional header key to be checked when a HTTP request " + "is received, and is the value expected to be matched." + " Allowed APIs: " + + Join(RESTRICTED_CATEGORY_NAMES, ", ")}); +#endif // TRITON_ENABLE_HTTP + +#if defined(TRITON_ENABLE_GRPC) + grpc_options_.push_back( + {OPTION_ALLOW_GRPC, "allow-grpc", Option::ArgBool, + "Allow the server to listen for GRPC requests."}); + grpc_options_.push_back( + {OPTION_GRPC_ADDRESS, "grpc-address", Option::ArgStr, + "The address for the grpc server to binds to. Default is 0.0.0.0"}); + grpc_options_.push_back( + {OPTION_GRPC_PORT, "grpc-port", Option::ArgInt, + "The port for the server to listen on for GRPC " + "requests. Default is 8001."}); + grpc_options_.push_back( + {OPTION_REUSE_GRPC_PORT, "reuse-grpc-port", Option::ArgBool, + "Allow multiple servers to listen on the same GRPC port when every " + "server has this option set. If you plan to use this option as a way to " + "load balance between different Triton servers, the same model " + "repository or set of models must be used for every server."}); + grpc_options_.push_back( + {OPTION_GRPC_HEADER_FORWARD_PATTERN, "grpc-header-forward-pattern", + Option::ArgStr, + "The regular expression pattern that will be used for forwarding GRPC " + "headers as inference request parameters."}); + grpc_options_.push_back( + {OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE, + "grpc-infer-allocation-pool-size", Option::ArgInt, + "The maximum number of inference request/response objects that remain " + "allocated for reuse. As long as the number of in-flight requests " + "doesn't exceed this value there will be no allocation/deallocation of " + "request/response objects."}); + grpc_options_.push_back( + {OPTION_GRPC_USE_SSL, "grpc-use-ssl", Option::ArgBool, + "Use SSL authentication for GRPC requests. Default is false."}); + grpc_options_.push_back( + {OPTION_GRPC_USE_SSL_MUTUAL, "grpc-use-ssl-mutual", Option::ArgBool, + "Use mututal SSL authentication for GRPC requests. This option will " + "preempt '--grpc-use-ssl' if it is also specified. Default is false."}); + grpc_options_.push_back( + {OPTION_GRPC_SERVER_CERT, "grpc-server-cert", Option::ArgStr, + "File holding PEM-encoded server certificate. Ignored unless " + "--grpc-use-ssl is true."}); + grpc_options_.push_back( + {OPTION_GRPC_SERVER_KEY, "grpc-server-key", Option::ArgStr, + "File holding PEM-encoded server key. Ignored unless " + "--grpc-use-ssl is true."}); + grpc_options_.push_back( + {OPTION_GRPC_ROOT_CERT, "grpc-root-cert", Option::ArgStr, + "File holding PEM-encoded root certificate. 
Ignore unless " + "--grpc-use-ssl is false."}); + grpc_options_.push_back( + {OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL, + "grpc-infer-response-compression-level", Option::ArgStr, + "The compression level to be used while returning the infer response to " + "the peer. Allowed values are none, low, medium and high. By default, " + "compression level is selected as none."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_KEEPALIVE_TIME_MS, "grpc-keepalive-time", Option::ArgInt, + "The period (in milliseconds) after which a keepalive ping is sent on " + "the transport. Default is 7200000 (2 hours)."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS, "grpc-keepalive-timeout", + Option::ArgInt, + "The period (in milliseconds) the sender of the keepalive ping waits " + "for an acknowledgement. If it does not receive an acknowledgment " + "within this time, it will close the connection. " + "Default is 20000 (20 seconds)."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, + "grpc-keepalive-permit-without-calls", Option::ArgBool, + "Allows keepalive pings to be sent even if there are no calls in flight " + "(0 : false; 1 : true). Default is 0 (false)."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, + "grpc-http2-max-pings-without-data", Option::ArgInt, + "The maximum number of pings that can be sent when there is no " + "data/header frame to be sent. gRPC Core will not continue sending " + "pings if we run over the limit. Setting it to 0 allows sending pings " + "without such a restriction. Default is 2."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS, + "grpc-http2-min-recv-ping-interval-without-data", Option::ArgInt, + "If there are no data/header frames being sent on the transport, this " + "channel argument on the server side controls the minimum time " + "(in milliseconds) that gRPC Core would expect between receiving " + "successive pings. If the time between successive pings is less than " + "this time, then the ping will be considered a bad ping from the peer. " + "Such a ping counts as a ‘ping strike’. Default is 300000 (5 " + "minutes)."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES, "grpc-http2-max-ping-strikes", + Option::ArgInt, + "Maximum number of bad pings that the server will tolerate before " + "sending an HTTP2 GOAWAY frame and closing the transport. Setting it to " + "0 allows the server to accept any number of bad pings. Default is 2."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_MAX_CONNECTION_AGE_MS, "grpc-max-connection-age", + Option::ArgInt, + "Maximum time that a channel may exist in milliseconds." + "Default is undefined."}); + grpc_options_.push_back( + {OPTION_GRPC_ARG_MAX_CONNECTION_AGE_GRACE_MS, + "grpc-max-connection-age-grace", Option::ArgInt, + "Grace period after the channel reaches its max age. " + "Default is undefined."}); + grpc_options_.push_back( + {OPTION_GRPC_RESTRICTED_PROTOCOL, "grpc-restricted-protocol", + ":=", + "Specify restricted GRPC protocol setting. The format of this " + "flag is --grpc-restricted-protocol=,=. Where " + " is a comma-separated list of protocols to be restricted. " + " will be additional header key to be checked when a GRPC request " + "is received, and is the value expected to be matched." 
+ " Allowed protocols: " + + Join(RESTRICTED_CATEGORY_NAMES, ", ")}); +#endif // TRITON_ENABLE_GRPC + +#ifdef TRITON_ENABLE_LOGGING + logging_options_.push_back( + {OPTION_LOG_VERBOSE, "log-verbose", Option::ArgInt, + "Set verbose logging level. Zero (0) disables verbose logging and " + "values >= 1 enable verbose logging."}); + logging_options_.push_back( + {OPTION_LOG_INFO, "log-info", Option::ArgBool, + "Enable/disable info-level logging."}); + logging_options_.push_back( + {OPTION_LOG_WARNING, "log-warning", Option::ArgBool, + "Enable/disable warning-level logging."}); + logging_options_.push_back( + {OPTION_LOG_ERROR, "log-error", Option::ArgBool, + "Enable/disable error-level logging."}); + logging_options_.push_back( + {OPTION_LOG_FORMAT, "log-format", Option::ArgStr, + "Set the logging format. Options are \"default\" and \"ISO8601\". " + "The default is \"default\". For \"default\", the log severity (L) and " + "timestamp will be logged as \"LMMDD hh:mm:ss.ssssss\". " + "For \"ISO8601\", the log format will be \"YYYY-MM-DDThh:mm:ssZ L\"."}); + logging_options_.push_back( + {OPTION_LOG_FILE, "log-file", Option::ArgStr, + "Set the name of the log output file. If specified, log outputs will be " + "saved to this file. If not specified, log outputs will stream to the " + "console."}); +#endif // TRITON_ENABLE_LOGGING + +#if defined(TRITON_ENABLE_SAGEMAKER) + sagemaker_options_.push_back( + {OPTION_ALLOW_SAGEMAKER, "allow-sagemaker", Option::ArgBool, + "Allow the server to listen for Sagemaker requests. Default is false."}); + sagemaker_options_.push_back( + {OPTION_SAGEMAKER_PORT, "sagemaker-port", Option::ArgInt, + "The port for the server to listen on for Sagemaker requests. Default " + "is 8080."}); + sagemaker_options_.push_back( + {OPTION_SAGEMAKER_SAFE_PORT_RANGE, "sagemaker-safe-port-range", + "-", + "Set the allowed port range for endpoints other than the SageMaker " + "endpoints."}); + sagemaker_options_.push_back( + {OPTION_SAGEMAKER_THREAD_COUNT, "sagemaker-thread-count", Option::ArgInt, + "Number of threads handling Sagemaker requests. Default is 8."}); +#endif // TRITON_ENABLE_SAGEMAKER + +#if defined(TRITON_ENABLE_VERTEX_AI) + vertex_options_.push_back( + {OPTION_ALLOW_VERTEX_AI, "allow-vertex-ai", Option::ArgBool, + "Allow the server to listen for Vertex AI requests. Default is true if " + "AIP_MODE=PREDICTION, false otherwise."}); + vertex_options_.push_back( + {OPTION_VERTEX_AI_PORT, "vertex-ai-port", Option::ArgInt, + "The port for the server to listen on for Vertex AI requests. Default " + "is AIP_HTTP_PORT if set, 8080 otherwise."}); + vertex_options_.push_back( + {OPTION_VERTEX_AI_THREAD_COUNT, "vertex-ai-thread-count", Option::ArgInt, + "Number of threads handling Vertex AI requests. Default is 8."}); + vertex_options_.push_back( + {OPTION_VERTEX_AI_DEFAULT_MODEL, "vertex-ai-default-model", + Option::ArgStr, + "The name of the model to use for single-model inference requests."}); +#endif // TRITON_ENABLE_VERTEX_AI + +#if defined(TRITON_ENABLE_METRICS) + metric_options_.push_back( + {OPTION_ALLOW_METRICS, "allow-metrics", Option::ArgBool, + "Allow the server to provide prometheus metrics."}); + metric_options_.push_back( + {OPTION_ALLOW_GPU_METRICS, "allow-gpu-metrics", Option::ArgBool, + "Allow the server to provide GPU metrics. Ignored unless " + "--allow-metrics is true."}); + metric_options_.push_back( + {OPTION_ALLOW_CPU_METRICS, "allow-cpu-metrics", Option::ArgBool, + "Allow the server to provide CPU metrics. 
Ignored unless " + "--allow-metrics is true."}); + metric_options_.push_back( + {OPTION_METRICS_ADDRESS, "metrics-address", Option::ArgStr, + "The address for the metrics server to bind to. Default is the same as " + "--http-address if built with HTTP support. Otherwise, default is " + "0.0.0.0"}); + metric_options_.push_back( + {OPTION_METRICS_PORT, "metrics-port", Option::ArgInt, + "The port reporting prometheus metrics. Default is 8002."}); + metric_options_.push_back( + {OPTION_METRICS_INTERVAL_MS, "metrics-interval-ms", Option::ArgFloat, + "Metrics will be collected once every " + "milliseconds. Default is 2000 milliseconds."}); + metric_options_.push_back( + {OPTION_METRICS_CONFIG, "metrics-config", "=", + "Specify a metrics-specific configuration setting. The format of this " + "flag is --metrics-config==. It can be specified " + "multiple times."}); +#endif // TRITON_ENABLE_METRICS + +#ifdef TRITON_ENABLE_TRACING + tracing_options_.push_back( + {OPTION_TRACE_CONFIG, "trace-config", ",=", + "Specify global or trace mode specific configuration setting. " + "The format of this flag is --trace-config " + ",=. " + "Where is either \"triton\" or \"opentelemetry\". " + "The default is \"triton\". To specify global trace settings " + "(level, rate, count, or mode), the format would be " + "--trace-config =. For \"triton\" mode, the server will " + "use " + "Triton's Trace APIs. For \"opentelemetry\" mode, the server will use " + "OpenTelemetry's APIs to generate, collect and export traces for " + "individual inference requests."}); +#endif // TRITON_ENABLE_TRACING + + cache_options_.push_back( + {OPTION_CACHE_CONFIG, "cache-config", ",=", + "Specify a cache-specific configuration setting. The format of this " + "flag is --cache-config=,=. Where " + " is the name of the cache, such as 'local' or 'redis'. " + "Example: --cache-config=local,size=1048576 will configure a 'local' " + "cache implementation with a fixed buffer pool of size 1048576 bytes."}); + cache_options_.push_back( + {OPTION_CACHE_DIR, "cache-directory", Option::ArgStr, + "The global directory searched for cache shared libraries. Default is " + "'/opt/tritonserver/caches'. This directory is expected to contain a " + "cache implementation as a shared library with the name " + "'libtritoncache.so'."}); + + + rate_limiter_options_.push_back( + // FIXME: fix the default to execution_count once RL logic is complete. + {OPTION_RATE_LIMIT, "rate-limit", Option::ArgStr, + "Specify the mode for rate limiting. Options are \"execution_count\" " + "and \"off\". The default is \"off\". For " + "\"execution_count\", the server will determine the instance using " + "configured priority and the number of time the instance has been " + "used to run inference. The inference will finally be executed once " + "the required resources are available. For \"off\", the server will " + "ignore any rate limiter config and run inference as soon as an " + "instance is ready."}); + rate_limiter_options_.push_back( + {OPTION_RATE_LIMIT_RESOURCE, "rate-limit-resource", + "::", + "The number of resources available to the server. The format of this " + "flag is --rate-limit-resource=::. The " + " is optional and if not listed will be applied to every " + "device. If the resource is specified as \"GLOBAL\" in the model " + "configuration the resource is considered shared among all the devices " + "in the system. The property is ignored for such resources. " + "This flag can be specified multiple times to specify each resources " + "and their availability. 
By default, the max across all instances that " + "list the resource is selected as its availability. The values for this " + "flag is case-insensitive."}); + + memory_device_options_.push_back( + {OPTION_PINNED_MEMORY_POOL_BYTE_SIZE, "pinned-memory-pool-byte-size", + Option::ArgInt, + "The total byte size that can be allocated as pinned system memory. " + "If GPU support is enabled, the server will allocate pinned system " + "memory to accelerate data transfer between host and devices until it " + "exceeds the specified byte size. If 'numa-node' is configured via " + "--host-policy, the pinned system memory of the pool size will be " + "allocated on each numa node. This option will not affect the " + "allocation conducted by the backend frameworks. Default is 256 MB."}); + memory_device_options_.push_back( + {OPTION_CUDA_MEMORY_POOL_BYTE_SIZE, "cuda-memory-pool-byte-size", + ":", + "The total byte size that can be allocated as CUDA memory for the GPU " + "device. If GPU support is enabled, the server will allocate CUDA " + "memory to minimize data transfer between host and devices until it " + "exceeds the specified byte size. This option will not affect the " + "allocation conducted by the backend frameworks. The argument should be " + "2 integers separated by colons in the format " + ":. This option can be used multiple " + "times, but only once per GPU device. Subsequent uses will overwrite " + "previous uses for the same GPU device. Default is 64 MB."}); + memory_device_options_.push_back( + {OPTION_CUDA_VIRTUAL_ADDRESS_SIZE, "cuda-virtual-address-size", + ":", + "The total CUDA virtual address size that will be used for each " + "implicit state when growable memory is used. This value determines " + "the maximum size of each implicit state. The state size cannot go " + "beyond this value. The argument should be " + "2 integers separated by colons in the format " + ":. This option can be used " + "multiple " + "times, but only once per GPU device. Subsequent uses will overwrite " + "previous uses for the same GPU device. Default is 1 GB."}); + memory_device_options_.push_back( + {OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY, + "min-supported-compute-capability", Option::ArgFloat, + "The minimum supported CUDA compute capability. GPUs that don't support " + "this compute capability will not be used by the server."}); + memory_device_options_.push_back( + {OPTION_BUFFER_MANAGER_THREAD_COUNT, "buffer-manager-thread-count", + Option::ArgInt, + "The number of threads used to accelerate copies and other operations " + "required to manage input and output tensor contents. Default is 0."}); + memory_device_options_.push_back( + {OPTION_HOST_POLICY, "host-policy", ",=", + "Specify a host policy setting associated with a policy name. The " + "format of this flag is --host-policy=,=. " + "Currently supported settings are 'numa-node', 'cpu-cores'. Note that " + "'numa-node' setting will affect pinned memory pool behavior, see " + "--pinned-memory-pool for more detail."}); + memory_device_options_.push_back( + {OPTION_MODEL_LOAD_GPU_LIMIT, "model-load-gpu-limit", + ":", + "Specify the limit on GPU memory usage as a fraction. If model loading " + "on the device is requested and the current memory usage exceeds the " + "limit, the load will be rejected. If not specified, the limit will " + "not be set."}); + + backend_options_.push_back( + {OPTION_BACKEND_DIR, "backend-directory", Option::ArgStr, + "The global directory searched for backend shared libraries. 
Default is " + "'/opt/tritonserver/backends'."}); + backend_options_.push_back( + {OPTION_BACKEND_CONFIG, "backend-config", ",=", + "Specify a backend-specific configuration setting. The format of this " + "flag is --backend-config=,=. Where " + " is the name of the backend, such as 'tensorrt'."}); + + repo_agent_options_.push_back( + {OPTION_REPOAGENT_DIR, "repoagent-directory", Option::ArgStr, + "The global directory searched for repository agent shared libraries. " + "Default is '/opt/tritonserver/repoagents'."}); + + // Deprecations + deprecated_options_.push_back( + {OPTION_STRICT_MODEL_CONFIG, "strict-model-config", Option::ArgBool, + "DEPRECATED: If true model configuration files must be provided and all " + "required " + "configuration settings must be specified. If false the model " + "configuration may be absent or only partially specified and the " + "server will attempt to derive the missing required configuration."}); + deprecated_options_.push_back( + {OPTION_RESPONSE_CACHE_BYTE_SIZE, "response-cache-byte-size", + Option::ArgInt, "DEPRECATED: Please use --cache-config instead."}); +#ifdef TRITON_ENABLE_TRACING + deprecated_options_.push_back( + {OPTION_TRACE_FILEPATH, "trace-file", Option::ArgStr, + "DEPRECATED: Please use --trace-config triton,file=" + " Set the file where trace output will be saved. If " + "--trace-log-frequency" + " is also specified, this argument value will be the prefix of the files" + " to save the trace output. See --trace-log-frequency for detail."}); + deprecated_options_.push_back( + {OPTION_TRACE_LEVEL, "trace-level", Option::ArgStr, + "DEPRECATED: Please use --trace-config level=" + "Specify a trace level. OFF to disable tracing, TIMESTAMPS to " + "trace timestamps, TENSORS to trace tensors. It may be specified " + "multiple times to trace multiple information. Default is OFF."}); + deprecated_options_.push_back( + {OPTION_TRACE_RATE, "trace-rate", Option::ArgInt, + "DEPRECATED: Please use --trace-config rate=" + "Set the trace sampling rate. Default is 1000."}); + deprecated_options_.push_back( + {OPTION_TRACE_COUNT, "trace-count", Option::ArgInt, + "DEPRECATED: Please use --trace-config count=" + "Set the number of traces to be sampled. If the value is -1, the number " + "of traces to be sampled will not be limited. Default is -1."}); + deprecated_options_.push_back( + {OPTION_TRACE_LOG_FREQUENCY, "trace-log-frequency", Option::ArgInt, + "DEPRECATED: Please use --trace-config triton,log-frequency=" + "Set the trace log frequency. If the value is 0, Triton will only log " + "the trace output to when shutting down. Otherwise, Triton " + "will log the trace output to . when it collects the " + "specified number of traces. For example, if the log frequency is 100, " + "when Triton collects the 100-th trace, it logs the traces to file " + ".0, and when it collects the 200-th trace, it logs the " + "101-th to the 200-th traces to file .1. 
Default is 0."}); +#endif // TRITON_ENABLE_TRACING +} + +void +TritonParser::SetupOptionGroups() +{ + SetupOptions(); + option_groups_.emplace_back(GLOBAL_OPTION_GROUP, global_options_); + option_groups_.emplace_back("Server", server_options_); + option_groups_.emplace_back("Logging", logging_options_); + option_groups_.emplace_back("Model Repository", model_repo_options_); + option_groups_.emplace_back("HTTP", http_options_); + option_groups_.emplace_back("GRPC", grpc_options_); + option_groups_.emplace_back("Sagemaker", sagemaker_options_); + option_groups_.emplace_back("Vertex", vertex_options_); + option_groups_.emplace_back("Metrics", metric_options_); + option_groups_.emplace_back("Tracing", tracing_options_); + option_groups_.emplace_back("Backend", backend_options_); + option_groups_.emplace_back("Repository Agent", repo_agent_options_); + option_groups_.emplace_back("Response Cache", cache_options_); + option_groups_.emplace_back("Rate Limiter", rate_limiter_options_); + option_groups_.emplace_back( + "Memory/Device Management", memory_device_options_); + option_groups_.emplace_back("DEPRECATED", deprecated_options_); +} + +TritonParser::TritonParser() +{ + SetupOptionGroups(); +} + +void +TritonServerParameters::CheckPortCollision() +{ + // [FIXME] try to make this function endpoint type agnostic + // List of enabled services and their constraints + std::vector< + std::tuple> + ports; +#ifdef TRITON_ENABLE_HTTP + if (allow_http_) { + ports.emplace_back("HTTP", http_address_, http_port_, false, -1, -1); + } +#endif // TRITON_ENABLE_HTTP +#ifdef TRITON_ENABLE_GRPC + if (allow_grpc_) { + ports.emplace_back( + "GRPC", grpc_options_.socket_.address_, grpc_options_.socket_.port_, + false, -1, -1); + } +#endif // TRITON_ENABLE_GRPC +#ifdef TRITON_ENABLE_METRICS + if (allow_metrics_) { + ports.emplace_back( + "metrics", metrics_address_, metrics_port_, false, -1, -1); + } +#endif // TRITON_ENABLE_METRICS +#ifdef TRITON_ENABLE_SAGEMAKER + if (allow_sagemaker_) { + ports.emplace_back( + "SageMaker", sagemaker_address_, sagemaker_port_, + sagemaker_safe_range_set_, sagemaker_safe_range_.first, + sagemaker_safe_range_.second); + } +#endif // TRITON_ENABLE_SAGEMAKER +#ifdef TRITON_ENABLE_VERTEX_AI + if (allow_vertex_ai_) { + ports.emplace_back( + "Vertex AI", vertex_ai_address_, vertex_ai_port_, false, -1, -1); + } +#endif // TRITON_ENABLE_VERTEX_AI + + for (auto curr_it = ports.begin(); curr_it != ports.end(); ++curr_it) { + // If the current service doesn't specify the allow port range for other + // services, then we don't need to revisit the checked services + auto comparing_it = (std::get<3>(*curr_it)) ? 
ports.begin() : (curr_it + 1); + for (; comparing_it != ports.end(); ++comparing_it) { + if (comparing_it == curr_it) { + continue; + } + if (std::get<1>(*curr_it) != std::get<1>(*comparing_it)) { + continue; + } + // Set range and comparing service port is out of range + if (std::get<3>(*curr_it) && + ((std::get<2>(*comparing_it) < std::get<4>(*curr_it)) || + (std::get<2>(*comparing_it) > std::get<5>(*curr_it)))) { + std::stringstream ss; + ss << "The server cannot listen to " << std::get<0>(*comparing_it) + << " requests at port " << std::get<2>(*comparing_it) + << ", allowed port range is [" << std::get<4>(*curr_it) << ", " + << std::get<5>(*curr_it) << "]" << std::endl; + throw ParseException(ss.str()); + } + if (std::get<2>(*curr_it) == std::get<2>(*comparing_it)) { + std::stringstream ss; + ss << "The server cannot listen to " << std::get<0>(*curr_it) + << " requests " + << "and " << std::get<0>(*comparing_it) + << " requests at the same address and port " << std::get<1>(*curr_it) + << ":" << std::get<2>(*curr_it) << std::endl; + throw ParseException(ss.str()); + } + } + } +} + +TritonServerParameters::ManagedTritonServerOptionPtr +TritonServerParameters::BuildTritonServerOptions() +{ + TRITONSERVER_ServerOptions* loptions = nullptr; + THROW_IF_ERR( + ParseException, TRITONSERVER_ServerOptionsNew(&loptions), + "creating server options"); + ManagedTritonServerOptionPtr managed_ptr( + loptions, TRITONSERVER_ServerOptionsDelete); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetServerId(loptions, server_id_.c_str()), + "setting server ID"); + for (const auto& model_repository_path : model_repository_paths_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelRepositoryPath( + loptions, model_repository_path.c_str()), + "setting model repository path"); + } + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelControlMode(loptions, control_mode_), + "setting model control mode"); + for (const auto& model : startup_models_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetStartupModel(loptions, model.c_str()), + "setting startup model"); + } + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelConfigName( + loptions, model_config_name_.c_str()), + "setting custom model configuration name for models"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetRateLimiterMode(loptions, rate_limit_mode_), + "setting rate limiter configuration"); + for (const auto& resource : rate_limit_resources_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsAddRateLimiterResource( + loptions, std::get<0>(resource).c_str(), std::get<1>(resource), + std::get<2>(resource)), + "setting rate limiter resource"); + } + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize( + loptions, pinned_memory_pool_byte_size_), + "setting total pinned memory byte size"); + for (const auto& cuda_pool : cuda_pools_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize( + loptions, cuda_pool.first, cuda_pool.second), + "setting total CUDA memory byte size"); + } + for (const auto& cuda_virtual_address_size : cuda_virtual_address_size_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetCudaVirtualAddressSize( + loptions, cuda_virtual_address_size.first, + cuda_virtual_address_size.second), + "setting total CUDA virtual address size"); + } + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability( 
+ loptions, min_supported_compute_capability_), + "setting minimum supported CUDA compute capability"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetExitOnError(loptions, exit_on_error_), + "setting exit on error"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetStrictModelConfig( + loptions, strict_model_config_), + "setting strict model configuration"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetStrictReadiness(loptions, strict_readiness_), + "setting strict readiness"); + // [FIXME] std::max seems to be part of Parse() + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetExitTimeout( + loptions, std::max(0, exit_timeout_secs_)), + "setting exit timeout"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetBufferManagerThreadCount( + loptions, std::max(0, buffer_manager_thread_count_)), + "setting buffer manager thread count"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelLoadThreadCount( + loptions, std::max(1u, model_load_thread_count_)), + "setting model load thread count"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelLoadRetryCount( + loptions, std::max(0u, model_load_retry_count_)), + "setting model load retry count"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelNamespacing( + loptions, enable_model_namespacing_), + "setting model namespacing"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetEnablePeerAccess( + loptions, enable_peer_access_), + "setting peer access"); + +#ifdef TRITON_ENABLE_LOGGING + TRITONSERVER_ServerOptionsSetLogFile(loptions, log_file_.c_str()); + THROW_IF_ERR( + ParseException, TRITONSERVER_ServerOptionsSetLogInfo(loptions, log_info_), + "setting log info enable"); + THROW_IF_ERR( + ParseException, TRITONSERVER_ServerOptionsSetLogWarn(loptions, log_warn_), + "setting log warn enable"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetLogError(loptions, log_error_), + "setting log error enable"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetLogVerbose(loptions, log_verbose_), + "setting log verbose level"); + switch (log_format_) { + case triton::common::Logger::Format::kDEFAULT: + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetLogFormat( + loptions, TRITONSERVER_LOG_DEFAULT), + "setting log format"); + break; + case triton::common::Logger::Format::kISO8601: + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetLogFormat( + loptions, TRITONSERVER_LOG_ISO8601), + "setting log format"); + break; + } +#endif // TRITON_ENABLE_LOGGING + +#ifdef TRITON_ENABLE_METRICS + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetMetrics(loptions, allow_metrics_), + "setting metrics enable"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetGpuMetrics(loptions, allow_gpu_metrics_), + "setting GPU metrics enable"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetCpuMetrics(loptions, allow_cpu_metrics_), + "setting CPU metrics enable"); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetMetricsInterval( + loptions, metrics_interval_ms_), + "setting metrics interval"); + for (const auto& mcs : metrics_config_settings_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetMetricsConfig( + loptions, std::get<0>(mcs).c_str(), std::get<1>(mcs).c_str(), + std::get<2>(mcs).c_str()), + "setting metrics configuration"); + } + +#endif // TRITON_ENABLE_METRICS + + THROW_IF_ERR( + 
ParseException, + TRITONSERVER_ServerOptionsSetBackendDirectory( + loptions, backend_dir_.c_str()), + "setting backend directory"); + + // Enable cache and configure it if a cache CLI arg is passed, + // this will allow for an empty configuration. + if (enable_cache_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetCacheDirectory( + loptions, cache_dir_.c_str()), + "setting cache directory"); + + for (const auto& cache_pair : cache_config_settings_) { + const auto& cache_name = cache_pair.first; + const auto& settings = cache_pair.second; + const auto& json_config_str = PairsToJsonStr(settings); + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetCacheConfig( + loptions, cache_name.c_str(), json_config_str.c_str()), + "setting cache configuration"); + } + } + + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetRepoAgentDirectory( + loptions, repoagent_dir_.c_str()), + "setting repository agent directory"); + for (const auto& bcs : backend_config_settings_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetBackendConfig( + loptions, std::get<0>(bcs).c_str(), std::get<1>(bcs).c_str(), + std::get<2>(bcs).c_str()), + "setting backend configuration"); + } + for (const auto& limit : load_gpu_limit_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit( + loptions, TRITONSERVER_INSTANCEGROUPKIND_GPU, limit.first, + limit.second), + "setting model load GPU limit"); + } + for (const auto& hp : host_policies_) { + THROW_IF_ERR( + ParseException, + TRITONSERVER_ServerOptionsSetHostPolicy( + loptions, std::get<0>(hp).c_str(), std::get<1>(hp).c_str(), + std::get<2>(hp).c_str()), + "setting host policy"); + } + return managed_ptr; +} + +std::pair> +TritonParser::Parse(int argc, char** argv) +{ + // + // Step 1. Before parsing setup + // + TritonServerParameters lparams; + bool strict_model_config_present{false}; + bool disable_auto_complete_config{false}; + bool cache_size_present{false}; + bool cache_config_present{false}; +#ifdef TRITON_ENABLE_TRACING + bool explicit_disable_trace{false}; + bool trace_filepath_present{false}; + bool trace_level_present{false}; + bool trace_rate_present{false}; + bool trace_count_present{false}; + bool trace_log_frequency_present{false}; +#endif // TRITON_ENABLE_TRACING + int option_index = 0; + +#ifdef TRITON_ENABLE_GRPC + triton::server::grpc::Options& lgrpc_options = lparams.grpc_options_; +#endif // TRITON_ENABLE_GRPC + +#if defined TRITON_ENABLE_HTTP || defined TRITON_ENABLE_GRPC + // According to HTTP specification header names are case-insensitive. + const std::string case_insensitive_prefix{"(?i)"}; +#endif // TRITON_ENABLE_HTTP || TRITON_ENABLE_GRPC + +#ifdef TRITON_ENABLE_VERTEX_AI + // Set different default value if specific flag is set + { + auto aip_mode = + triton::server::GetEnvironmentVariableOrDefault("AIP_MODE", ""); + // Enable Vertex AI service and disable HTTP / GRPC service by default + // if detecting Vertex AI environment + if (aip_mode == "PREDICTION") { + lparams.allow_vertex_ai_ = true; +#ifdef TRITON_ENABLE_HTTP + lparams.allow_http_ = false; +#endif // TRITON_ENABLE_HTTP +#ifdef TRITON_ENABLE_GRPC + lparams.allow_grpc_ = false; +#endif // TRITON_ENABLE_GRPC + } + auto port = triton::server::GetEnvironmentVariableOrDefault( + "AIP_HTTP_PORT", "8080"); + lparams.vertex_ai_port_ = ParseOption(port); + } +#endif // TRITON_ENABLE_VERTEX_AI + + // + // Step 2. 
parse options + // + std::vector long_options; + for (const auto& group : option_groups_) { + for (const auto& o : group.second) { + long_options.push_back(o.GetLongOption()); + } + } + long_options.push_back({nullptr, 0, nullptr, 0}); + + int flag; + while ((flag = getopt_long( + argc, argv, "", &long_options[0], &option_index)) != -1) { + try { + switch (flag) { + case OPTION_HELP: + // [FIXME] how help is printed? + case '?': + // [FIXME] fall through when seeing this, currently consumes all + // options [FIXME] disable stderr output of `getopt_long` + throw ParseException(); +#ifdef TRITON_ENABLE_LOGGING + case OPTION_LOG_VERBOSE: + lparams.log_verbose_ = ParseIntBoolOption(optarg); + break; + case OPTION_LOG_INFO: + lparams.log_info_ = ParseOption(optarg); + break; + case OPTION_LOG_WARNING: + lparams.log_warn_ = ParseOption(optarg); + break; + case OPTION_LOG_ERROR: + lparams.log_error_ = ParseOption(optarg); + break; + case OPTION_LOG_FORMAT: { + std::string format_str(optarg); + if (format_str == "default") { + lparams.log_format_ = triton::common::Logger::Format::kDEFAULT; + } else if (format_str == "ISO8601") { + lparams.log_format_ = triton::common::Logger::Format::kISO8601; + } else { + throw ParseException("invalid argument for --log-format"); + } + break; + } + case OPTION_LOG_FILE: + lparams.log_file_ = optarg; + break; +#endif // TRITON_ENABLE_LOGGING + + case OPTION_ID: + lparams.server_id_ = optarg; + break; + case OPTION_MODEL_REPOSITORY: + lparams.model_repository_paths_.insert(optarg); + break; + case OPTION_EXIT_ON_ERROR: + lparams.exit_on_error_ = ParseOption(optarg); + break; + case OPTION_DISABLE_AUTO_COMPLETE_CONFIG: + disable_auto_complete_config = true; + break; + case OPTION_STRICT_MODEL_CONFIG: + std::cerr << "Warning: '--strict-model-config' has been deprecated! " + "Please use '--disable-auto-complete-config' instead." 
+ << std::endl; + strict_model_config_present = true; + lparams.strict_model_config_ = ParseOption(optarg); + break; + case OPTION_STRICT_READINESS: + lparams.strict_readiness_ = ParseOption(optarg); + break; + +#ifdef TRITON_ENABLE_HTTP + case OPTION_ALLOW_HTTP: + lparams.allow_http_ = ParseOption(optarg); + break; + case OPTION_HTTP_PORT: + lparams.http_port_ = ParseOption(optarg); + break; + case OPTION_REUSE_HTTP_PORT: + lparams.reuse_http_port_ = ParseOption(optarg); + break; + case OPTION_HTTP_ADDRESS: + lparams.http_address_ = optarg; + break; + case OPTION_HTTP_HEADER_FORWARD_PATTERN: + lparams.http_forward_header_pattern_ = + std::move(case_insensitive_prefix + optarg); + break; + case OPTION_HTTP_THREAD_COUNT: + lparams.http_thread_cnt_ = ParseOption(optarg); + break; + case OPTION_HTTP_RESTRICTED_API: + ParseRestrictedFeatureOption( + optarg, long_options[option_index].name, "", "api", + lparams.http_restricted_apis_); + break; + +#endif // TRITON_ENABLE_HTTP + +#ifdef TRITON_ENABLE_SAGEMAKER + case OPTION_ALLOW_SAGEMAKER: + lparams.allow_sagemaker_ = ParseOption(optarg); + break; + case OPTION_SAGEMAKER_PORT: + lparams.sagemaker_port_ = ParseOption(optarg); + break; + case OPTION_SAGEMAKER_SAFE_PORT_RANGE: + lparams.sagemaker_safe_range_set_ = true; + lparams.sagemaker_safe_range_ = + ParsePairOption(optarg, "-"); + break; + case OPTION_SAGEMAKER_THREAD_COUNT: + lparams.sagemaker_thread_cnt_ = ParseOption(optarg); + break; +#endif // TRITON_ENABLE_SAGEMAKER + +#ifdef TRITON_ENABLE_VERTEX_AI + case OPTION_ALLOW_VERTEX_AI: + lparams.allow_vertex_ai_ = ParseOption(optarg); + break; + case OPTION_VERTEX_AI_PORT: + lparams.vertex_ai_port_ = ParseOption(optarg); + break; + case OPTION_VERTEX_AI_THREAD_COUNT: + lparams.vertex_ai_thread_cnt_ = ParseOption(optarg); + break; + case OPTION_VERTEX_AI_DEFAULT_MODEL: + lparams.vertex_ai_default_model_ = optarg; + break; +#endif // TRITON_ENABLE_VERTEX_AI + +#ifdef TRITON_ENABLE_GRPC + case OPTION_ALLOW_GRPC: + lparams.allow_grpc_ = ParseOption(optarg); + break; + case OPTION_GRPC_PORT: + lgrpc_options.socket_.port_ = ParseOption(optarg); + break; + case OPTION_REUSE_GRPC_PORT: + lgrpc_options.socket_.reuse_port_ = ParseOption(optarg); + break; + case OPTION_GRPC_ADDRESS: + lgrpc_options.socket_.address_ = optarg; + break; + case OPTION_GRPC_INFER_ALLOCATION_POOL_SIZE: + lgrpc_options.infer_allocation_pool_size_ = ParseOption(optarg); + break; + case OPTION_GRPC_USE_SSL: + lgrpc_options.ssl_.use_ssl_ = ParseOption(optarg); + break; + case OPTION_GRPC_USE_SSL_MUTUAL: + lgrpc_options.ssl_.use_mutual_auth_ = ParseOption(optarg); + lgrpc_options.ssl_.use_ssl_ = true; + break; + case OPTION_GRPC_SERVER_CERT: + lgrpc_options.ssl_.server_cert_ = optarg; + break; + case OPTION_GRPC_SERVER_KEY: + lgrpc_options.ssl_.server_key_ = optarg; + break; + case OPTION_GRPC_ROOT_CERT: + lgrpc_options.ssl_.root_cert_ = optarg; + break; + case OPTION_GRPC_RESPONSE_COMPRESSION_LEVEL: { + std::string mode_str(optarg); + std::transform( + mode_str.begin(), mode_str.end(), mode_str.begin(), ::tolower); + if (mode_str == "none") { + lgrpc_options.infer_compression_level_ = GRPC_COMPRESS_LEVEL_NONE; + } else if (mode_str == "low") { + lgrpc_options.infer_compression_level_ = GRPC_COMPRESS_LEVEL_LOW; + } else if (mode_str == "medium") { + lgrpc_options.infer_compression_level_ = GRPC_COMPRESS_LEVEL_MED; + } else if (mode_str == "high") { + lgrpc_options.infer_compression_level_ = GRPC_COMPRESS_LEVEL_HIGH; + } else { + throw ParseException( + "invalid argument for " + 
"--grpc_infer_response_compression_level"); + } + break; + } + case OPTION_GRPC_ARG_KEEPALIVE_TIME_MS: + lgrpc_options.keep_alive_.keepalive_time_ms_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_KEEPALIVE_TIMEOUT_MS: + lgrpc_options.keep_alive_.keepalive_timeout_ms_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS: + lgrpc_options.keep_alive_.keepalive_permit_without_calls_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA: + lgrpc_options.keep_alive_.http2_max_pings_without_data_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_HTTP2_MIN_RECV_PING_INTERVAL_WITHOUT_DATA_MS: + lgrpc_options.keep_alive_ + .http2_min_recv_ping_interval_without_data_ms_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_HTTP2_MAX_PING_STRIKES: + lgrpc_options.keep_alive_.http2_max_ping_strikes_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_MAX_CONNECTION_AGE_MS: + lgrpc_options.keep_alive_.max_connection_age_ms_ = + ParseOption(optarg); + break; + case OPTION_GRPC_ARG_MAX_CONNECTION_AGE_GRACE_MS: + lgrpc_options.keep_alive_.max_connection_age_grace_ms_ = + ParseOption(optarg); + break; + case OPTION_GRPC_RESTRICTED_PROTOCOL: { + ParseRestrictedFeatureOption( + optarg, long_options[option_index].name, + std::string( + triton::server::grpc::kRestrictedProtocolHeaderTemplate), + "protocol", lgrpc_options.restricted_protocols_); + break; + } + case OPTION_GRPC_HEADER_FORWARD_PATTERN: + lgrpc_options.forward_header_pattern_ = + std::move(case_insensitive_prefix + optarg); + break; +#endif // TRITON_ENABLE_GRPC + +#ifdef TRITON_ENABLE_METRICS + case OPTION_ALLOW_METRICS: + lparams.allow_metrics_ = ParseOption(optarg); + break; + case OPTION_ALLOW_GPU_METRICS: + lparams.allow_gpu_metrics_ = ParseOption(optarg); + break; + case OPTION_ALLOW_CPU_METRICS: + lparams.allow_cpu_metrics_ = ParseOption(optarg); + break; + case OPTION_METRICS_ADDRESS: + lparams.metrics_address_ = optarg; + break; + case OPTION_METRICS_PORT: + lparams.metrics_port_ = ParseOption(optarg); + break; + case OPTION_METRICS_INTERVAL_MS: + lparams.metrics_interval_ms_ = ParseOption(optarg); + break; + case OPTION_METRICS_CONFIG: + lparams.metrics_config_settings_.push_back( + ParseMetricsConfigOption(optarg)); + break; +#endif // TRITON_ENABLE_METRICS + +#ifdef TRITON_ENABLE_TRACING + case OPTION_TRACE_FILEPATH: { + std::cerr << "Warning: '--trace-file' has been deprecated and will be" + " removed in future releases. Please use " + "'--trace-config triton,file= instead." + << std::endl; + trace_filepath_present = true; + lparams.trace_filepath_ = optarg; + break; + } + case OPTION_TRACE_LEVEL: { + std::cerr + << "Warning: '--trace-level' has been deprecated and will be" + " removed in future releases. Please use " + "'--trace-config level= instead." + << std::endl; + trace_level_present = true; + auto parsed_level = ParseTraceLevelOption(optarg); + explicit_disable_trace |= + (parsed_level == TRITONSERVER_TRACE_LEVEL_DISABLED); + lparams.trace_level_ = static_cast( + lparams.trace_level_ | parsed_level); + break; + } + case OPTION_TRACE_RATE: + std::cerr << "Warning: '--trace-rate' has been deprecated and will be" + " removed in future releases. Please use " + "'--trace-config rate= instead." + << std::endl; + trace_rate_present = true; + lparams.trace_rate_ = ParseOption(optarg); + break; + + case OPTION_TRACE_COUNT: + std::cerr + << "Warning: '--trace-count' has been deprecated and will be" + " removed in future releases. 
Please use " + "'--trace-config count= instead." + << std::endl; + trace_count_present = true; + lparams.trace_count_ = ParseOption(optarg); + break; + case OPTION_TRACE_LOG_FREQUENCY: + std::cerr + << "Warning: '--trace-log-frequency' has been deprecated and " + "will be" + " removed in future releases. Please use " + "'--trace-config triton,log-frequency= instead." + << std::endl; + trace_log_frequency_present = true; + lparams.trace_log_frequency_ = ParseOption(optarg); + break; + case OPTION_TRACE_CONFIG: { + auto trace_config_setting = ParseTraceConfigOption(optarg); + triton::server::TraceConfig& tc = + lparams + .trace_config_map_[std::get<0>(trace_config_setting).c_str()]; + tc.push_back(std::make_pair( + std::get<1>(trace_config_setting).c_str(), + std::get<2>(trace_config_setting).c_str())); + break; + } +#endif // TRITON_ENABLE_TRACING + + case OPTION_POLL_REPO_SECS: + lparams.repository_poll_secs_ = ParseOption(optarg); + break; + case OPTION_STARTUP_MODEL: + lparams.startup_models_.insert(optarg); + break; + case OPTION_CUSTOM_MODEL_CONFIG_NAME: + if (std::strlen(optarg) == 0) { + throw ParseException( + "Error: empty argument for --model-config-name"); + } + lparams.model_config_name_ = optarg; + break; + case OPTION_MODEL_CONTROL_MODE: { + std::string mode_str(optarg); + std::transform( + mode_str.begin(), mode_str.end(), mode_str.begin(), ::tolower); + if (mode_str == "none") { + lparams.control_mode_ = TRITONSERVER_MODEL_CONTROL_NONE; + } else if (mode_str == "poll") { + lparams.control_mode_ = TRITONSERVER_MODEL_CONTROL_POLL; + } else if (mode_str == "explicit") { + lparams.control_mode_ = TRITONSERVER_MODEL_CONTROL_EXPLICIT; + } else { + throw ParseException("invalid argument for --model-control-mode"); + } + break; + } + case OPTION_RATE_LIMIT: { + std::string rate_limit_str(optarg); + std::transform( + rate_limit_str.begin(), rate_limit_str.end(), + rate_limit_str.begin(), ::tolower); + if (rate_limit_str == "execution_count") { + lparams.rate_limit_mode_ = TRITONSERVER_RATE_LIMIT_EXEC_COUNT; + } else if (rate_limit_str == "off") { + lparams.rate_limit_mode_ = TRITONSERVER_RATE_LIMIT_OFF; + } else { + throw ParseException("invalid argument for --rate-limit"); + } + break; + } + case OPTION_RATE_LIMIT_RESOURCE: { + std::string rate_limit_resource_str(optarg); + std::transform( + rate_limit_resource_str.begin(), rate_limit_resource_str.end(), + rate_limit_resource_str.begin(), ::tolower); + lparams.rate_limit_resources_.push_back( + ParseRateLimiterResourceOption(optarg)); + break; + } + case OPTION_PINNED_MEMORY_POOL_BYTE_SIZE: + lparams.pinned_memory_pool_byte_size_ = ParseOption(optarg); + break; + case OPTION_CUDA_MEMORY_POOL_BYTE_SIZE: + lparams.cuda_pools_.push_back( + ParsePairOption(optarg, ":")); + break; + case OPTION_CUDA_VIRTUAL_ADDRESS_SIZE: + lparams.cuda_virtual_address_size_.push_back( + ParsePairOption(optarg, ":")); + break; + case OPTION_RESPONSE_CACHE_BYTE_SIZE: { + cache_size_present = true; + const auto byte_size = std::to_string(ParseOption(optarg)); + lparams.cache_config_settings_["local"] = {{"size", byte_size}}; + std::cerr + << "Warning: '--response-cache-byte-size' has been deprecated! " + "This will default to the 'local' cache implementation with " + "the provided byte size for its config. Please use " + "'--cache-config' instead. 
The equivalent " + "--cache-config CLI args would be: " + "'--cache-config=local,size=" + + byte_size + "'" + << std::endl; + break; + } + case OPTION_CACHE_CONFIG: { + cache_config_present = true; + const auto cache_setting = ParseCacheConfigOption(optarg); + const auto& cache_name = std::get<0>(cache_setting); + const auto& key = std::get<1>(cache_setting); + const auto& value = std::get<2>(cache_setting); + lparams.cache_config_settings_[cache_name].push_back({key, value}); + break; + } + case OPTION_CACHE_DIR: + lparams.cache_dir_ = optarg; + break; + case OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY: + lparams.min_supported_compute_capability_ = + ParseOption(optarg); + break; + case OPTION_EXIT_TIMEOUT_SECS: + lparams.exit_timeout_secs_ = ParseOption(optarg); + break; + case OPTION_BACKEND_DIR: + lparams.backend_dir_ = optarg; + break; + case OPTION_REPOAGENT_DIR: + lparams.repoagent_dir_ = optarg; + break; + case OPTION_BUFFER_MANAGER_THREAD_COUNT: + lparams.buffer_manager_thread_count_ = ParseOption(optarg); + break; + case OPTION_MODEL_LOAD_THREAD_COUNT: + lparams.model_load_thread_count_ = ParseOption(optarg); + break; + case OPTION_MODEL_LOAD_RETRY_COUNT: + lparams.model_load_retry_count_ = ParseOption(optarg); + break; + case OPTION_BACKEND_CONFIG: + lparams.backend_config_settings_.push_back( + ParseBackendConfigOption(optarg)); + break; + case OPTION_HOST_POLICY: + lparams.host_policies_.push_back(ParseHostPolicyOption(optarg)); + break; + case OPTION_MODEL_LOAD_GPU_LIMIT: + lparams.load_gpu_limit_.emplace( + ParsePairOption(optarg, ":")); + break; + case OPTION_MODEL_NAMESPACING: + lparams.enable_model_namespacing_ = ParseOption(optarg); + break; + case OPTION_ENABLE_PEER_ACCESS: + lparams.enable_peer_access_ = ParseOption(optarg); + break; + } + } + catch (const ParseException& pe) { + if ((pe.what() != NULL) && (strlen(pe.what()) != 0)) { + std::stringstream ss; + ss << "Bad option: \"--" << long_options[option_index].name << "\".\n" + << pe.what() << std::endl; + throw ParseException(ss.str()); + } else { + // In case of `Unrecognized option` or `Help` option, just throw a + // ParseException + throw ParseException(); + } + } + } + + if (optind < argc) { + throw ParseException(std::string("Unexpected argument: ") + argv[optind]); + } + + // + // Step 3. Post parsing validation, usually for options that depend on the + // others which are not determined until after parsing. 
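+  // For example (illustrative invocation; the model name is hypothetical):
+  //   tritonserver --model-repository=/models --model-control-mode=explicit \
+  //                --load-model=densenet_onnx
+  // is accepted, whereas passing --load-model without
+  // --model-control-mode=explicit is rejected below, and the repository poll
+  // interval is zeroed out unless the control mode is "poll". Neither check
+  // can run until every flag has been parsed.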
+ // + + if (lparams.control_mode_ != TRITONSERVER_MODEL_CONTROL_POLL) { + lparams.repository_poll_secs_ = 0; + } + + if (lparams.startup_models_.size() > 0 && + lparams.control_mode_ != TRITONSERVER_MODEL_CONTROL_EXPLICIT) { + throw ParseException( + "Error: Use of '--load-model' requires setting " + "'--model-control-mode=explicit' as well."); + } + + +#ifdef TRITON_ENABLE_VERTEX_AI + // Set default model repository if specific flag is set, postpone the + // check to after parsing so we only monitor the default repository if + // Vertex service is allowed + if (lparams.model_repository_paths_.empty()) { + auto aip_storage_uri = + triton::server::GetEnvironmentVariableOrDefault("AIP_STORAGE_URI", ""); + if (!aip_storage_uri.empty()) { + lparams.model_repository_paths_.insert(aip_storage_uri); + } + } +#endif // TRITON_ENABLE_VERTEX_AI + +#ifdef TRITON_ENABLE_METRICS + lparams.allow_gpu_metrics_ &= lparams.allow_metrics_; + lparams.allow_cpu_metrics_ &= lparams.allow_metrics_; + // Set metrics_address to default if never specified + if (lparams.metrics_address_.empty()) { +#ifdef TRITON_ENABLE_HTTP + // If built with HTTP support, default to HTTP address + lparams.metrics_address_ = lparams.http_address_; +#else + // Otherwise have default for builds without HTTP support + lparams.metrics_address_ = "0.0.0.0"; +#endif // TRITON_ENABLE_HTTP + } +#endif // TRITON_ENABLE_METRICS + +#ifdef TRITON_ENABLE_TRACING + PostProcessTraceArgs( + lparams, trace_level_present, trace_rate_present, trace_count_present, + trace_filepath_present, trace_log_frequency_present, + explicit_disable_trace); +#endif // TRITON_ENABLE_TRACING + + // Check if there is a conflict between --disable-auto-complete-config + // and --strict-model-config + if (disable_auto_complete_config) { + if (strict_model_config_present && !lparams.strict_model_config_) { + std::cerr + << "Warning: Overriding deprecated '--strict-model-config' from " + "False to True in favor of '--disable-auto-complete-config'!" + << std::endl; + } + lparams.strict_model_config_ = true; + } + + // Check if there is a conflict between --response-cache-byte-size + // and --cache-config + if (cache_size_present && cache_config_present) { + throw ParseException( + "Error: Incompatible flags --response-cache-byte-size and " + "--cache-config both provided. 
Please provide one or the other."); + } + lparams.enable_cache_ = (cache_size_present || cache_config_present); + return {lparams, {}}; +} + +std::string +TritonParser::FormatUsageMessage(std::string str, int offset) +{ + int width = 60; + int current_pos = offset; + while (current_pos + width < int(str.length())) { + int n = str.rfind(' ', current_pos + width); + if (n != int(std::string::npos)) { + str.replace(n, 1, "\n\t"); + current_pos += (width + 9); + } + } + + return str; +} + +std::string +TritonParser::Usage() +{ + std::stringstream ss; + for (const auto& group : option_groups_) { + if (!group.first.empty() && !group.second.empty()) { + ss << std::endl << group.first << ":" << std::endl; + } + + for (const auto& o : group.second) { + if (!o.arg_desc_.empty()) { + ss << " --" << o.flag_ << " <" << o.arg_desc_ << ">" << std::endl + << "\t" << FormatUsageMessage(o.desc_, 0) << std::endl; + } else { + ss << " --" << o.flag_ << std::endl + << "\t" << FormatUsageMessage(o.desc_, 0) << std::endl; + } + } + } + return ss.str(); +} + +std::tuple +TritonParser::ParseMetricsConfigOption(const std::string& arg) +{ + // Format is "=" for generic configs/settings + int delim_setting = arg.find("="); + if (delim_setting < 0) { + std::stringstream ss; + ss << "--metrics-config option format is " + << "=. Got " << arg << std::endl; + throw ParseException(ss.str()); + } + + // Break section before "=" into substr to avoid matching commas + // in setting values. + auto name_substr = arg.substr(0, delim_setting); + int delim_name = name_substr.find(","); + + // No name-specific configs currently supported, though it may be in + // the future. Map global configs to empty string like other configs for + // now. + std::string name_string = std::string(); + if (delim_name >= 0) { + std::stringstream ss; + ss << "--metrics-config option format is " + << "=. Got " << arg << std::endl; + throw ParseException(ss.str()); + } // else global metrics config + + std::string setting_string = + arg.substr(delim_name + 1, delim_setting - delim_name - 1); + std::string value_string = arg.substr(delim_setting + 1); + + if (setting_string.empty() || value_string.empty()) { + std::stringstream ss; + ss << "--metrics-config option format is " + << "=. Got " << arg << std::endl; + throw ParseException(ss.str()); + } + + return {name_string, setting_string, value_string}; +} + +std::tuple +TritonParser::ParseCacheConfigOption(const std::string& arg) +{ + // Format is ",=" for specific + // config/settings and "=" for cache agnostic + // configs/settings + int delim_name = arg.find(","); + int delim_setting = arg.find("=", delim_name + 1); + + std::string name_string = std::string(); + if (delim_name > 0) { + name_string = arg.substr(0, delim_name); + } + // No cache-agnostic global settings are currently supported + else { + std::stringstream ss; + ss << "No cache specified. --cache-config option format is " + << ",=. Got " << arg << std::endl; + throw ParseException(ss.str()); + } + + if (delim_setting < 0) { + std::stringstream ss; + ss << "--cache-config option format is ',='. Got " + << arg << std::endl; + throw ParseException(ss.str()); + } + std::string setting_string = + arg.substr(delim_name + 1, delim_setting - delim_name - 1); + std::string value_string = arg.substr(delim_setting + 1); + + if (setting_string.empty() || value_string.empty()) { + std::stringstream ss; + ss << "--cache-config option format is ',='. 
Got " + << arg << std::endl; + throw ParseException(ss.str()); + } + + return {name_string, setting_string, value_string}; +} + +std::tuple +TritonParser::ParseRateLimiterResourceOption(const std::string& arg) +{ + std::string error_string( + "--rate-limit-resource option format is " + "'::' or ':'. " + "Got " + + arg); + + std::string name_string(""); + int count = -1; + int device_id = -1; + + size_t delim_first = arg.find(":"); + size_t delim_second = arg.find(":", delim_first + 1); + + if (delim_second != std::string::npos) { + // Handle format `::' + size_t delim_third = arg.find(":", delim_second + 1); + if (delim_third != std::string::npos) { + throw ParseException(error_string); + } + name_string = arg.substr(0, delim_first); + count = ParseOption( + arg.substr(delim_first + 1, delim_second - delim_first - 1)); + device_id = ParseOption(arg.substr(delim_second + 1)); + } else if (delim_first != std::string::npos) { + // Handle format `:' + name_string = arg.substr(0, delim_first); + count = ParseOption(arg.substr(delim_first + 1)); + } else { + // If no colons found + throw ParseException(error_string); + } + + return {name_string, count, device_id}; +} + +std::tuple +TritonParser::ParseBackendConfigOption(const std::string& arg) +{ + // Format is ",=" for specific + // config/settings and "=" for backend agnostic + // configs/settings + int delim_name = arg.find(","); + int delim_setting = arg.find("=", delim_name + 1); + + std::string name_string = std::string(); + if (delim_name > 0) { + name_string = arg.substr(0, delim_name); + } else if (delim_name == 0) { + std::stringstream ss; + ss << "No backend specified. --backend-config option format is " + << ",= or " + << "=. Got " << arg << std::endl; + throw ParseException(ss.str()); + } // else global backend config + + if (delim_setting < 0) { + std::stringstream ss; + ss << "--backend-config option format is ',='. Got " + << arg << std::endl; + throw ParseException(ss.str()); + } + std::string setting_string = + arg.substr(delim_name + 1, delim_setting - delim_name - 1); + std::string value_string = arg.substr(delim_setting + 1); + + if (setting_string.empty() || value_string.empty()) { + std::stringstream ss; + ss << "--backend-config option format is ',='. 
Got " + << arg << std::endl; + throw ParseException(ss.str()); + } + + return {name_string, setting_string, value_string}; +} + +void +TritonParser::ParseRestrictedFeatureOption( + const std::string& arg, const std::string& option_name, + const std::string& key_prefix, const std::string& feature_type, + RestrictedFeatures& restricted_features) +{ + const auto& parsed_tuple = + ParseGenericConfigOption(arg, ":", "=", option_name, "config name"); + + const auto& features = SplitOptions(std::get<0>(parsed_tuple), ","); + const auto& key = std::get<1>(parsed_tuple); + const auto& value = std::get<2>(parsed_tuple); + + for (const auto& feature : features) { + const auto& category = RestrictedFeatures::ToCategory(feature); + + if (category == RestrictedCategory::INVALID) { + std::stringstream ss; + ss << "unknown restricted " << feature_type << " '" << feature << "' " + << std::endl; + throw ParseException(ss.str()); + } + + if (restricted_features.IsRestricted(category)) { + // restricted feature can only be in one group + std::stringstream ss; + ss << "restricted " << feature_type << " '" << feature + << "' can not be specified in multiple config groups" << std::endl; + throw ParseException(ss.str()); + } + restricted_features.Insert( + category, std::make_pair(key_prefix + key, value)); + } +} + +std::tuple +TritonParser::ParseHostPolicyOption(const std::string& arg) +{ + return ParseGenericConfigOption(arg, ",", "=", "host-policy", "policy name"); +} + +std::tuple +TritonParser::ParseGenericConfigOption( + const std::string& arg, const std::string& first_delim, + const std::string& second_delim, const std::string& option_name, + const std::string& config_name) +{ + // Format is ",=" + int delim_name = arg.find(first_delim); + int delim_setting = arg.find(second_delim, delim_name + 1); + + std::string error_string = "--" + option_name + " option format is '<" + + config_name + ">" + first_delim + "" + + second_delim + "'. Got " + arg + "\n"; + + // Check for 2 semicolons + if ((delim_name < 0) || (delim_setting < 0)) { + throw ParseException(error_string); + } + + std::string name_string = arg.substr(0, delim_name); + std::string setting_string = + arg.substr(delim_name + 1, delim_setting - delim_name - 1); + std::string value_string = arg.substr(delim_setting + 1); + + if (name_string.empty() || setting_string.empty() || value_string.empty()) { + throw ParseException(error_string); + } + + return {name_string, setting_string, value_string}; +} + +#ifdef TRITON_ENABLE_TRACING +TRITONSERVER_InferenceTraceLevel +TritonParser::ParseTraceLevelOption(std::string arg) +{ + std::transform(arg.begin(), arg.end(), arg.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if ((arg == "false") || (arg == "off")) { + return TRITONSERVER_TRACE_LEVEL_DISABLED; + } + if ((arg == "true") || (arg == "on") || (arg == "min") || (arg == "max") || + (arg == "timestamps")) { + return TRITONSERVER_TRACE_LEVEL_TIMESTAMPS; + } + if (arg == "tensors") { + return TRITONSERVER_TRACE_LEVEL_TENSORS; + } + + throw ParseException("invalid value for trace level option: " + arg); +} + +InferenceTraceMode +TritonParser::ParseTraceModeOption(std::string arg) +{ + std::transform(arg.begin(), arg.end(), arg.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (arg == "triton") { + return TRACE_MODE_TRITON; + } + if (arg == "opentelemetry") { + return TRACE_MODE_OPENTELEMETRY; + } + + throw ParseException( + "invalid value for trace mode option: " + arg + + ". 
Available options are \"triton\" and \"opentelemetry\""); +} + +std::tuple +TritonParser::ParseTraceConfigOption(const std::string& arg) +{ + int delim_name = arg.find(","); + int delim_setting = arg.find("=", delim_name + 1); + + std::string name_string = std::string(); + if (delim_name > 0) { + name_string = + std::to_string(ParseTraceModeOption(arg.substr(0, delim_name))); + } else if (delim_name == 0) { + std::stringstream ss; + ss << "No trace mode specified. --trace-config option format is " + << ",= or " + << "=. Got " << arg << std::endl; + throw ParseException(ss.str()); + } // else global trace config + + if (delim_setting < 0) { + std::stringstream ss; + ss << "--trace-config option format is ',='. " + "Got " + << arg << std::endl; + throw ParseException(ss.str()); + } + std::string setting_string = + arg.substr(delim_name + 1, delim_setting - delim_name - 1); + std::string value_string = arg.substr(delim_setting + 1); + + if (setting_string.empty() || value_string.empty()) { + std::stringstream ss; + ss << "--trace-config option format is ',='. " + "Got " + << arg << std::endl; + throw ParseException(ss.str()); + } + + return {name_string, setting_string, value_string}; +} + +void +TritonParser::SetGlobalTraceArgs( + TritonServerParameters& lparams, bool trace_level_present, + bool trace_rate_present, bool trace_count_present, + bool explicit_disable_trace) +{ + for (const auto& [setting, value_variant] : lparams.trace_config_map_[""]) { + auto value = std::get(value_variant); + try { + if (setting == "rate") { + if (trace_rate_present) { + std::cerr << "Warning: Overriding deprecated '--trace-rate' " + "in favor of provided rate value in --trace-config!" + << std::endl; + } + lparams.trace_rate_ = ParseOption(value); + } + if (setting == "level") { + if (trace_level_present) { + std::cerr << "Warning: Overriding deprecated '--trace-level' " + "in favor of provided level in --trace-config!" + << std::endl; + } + auto parsed_level_config = ParseTraceLevelOption(value); + explicit_disable_trace |= + (parsed_level_config == TRITONSERVER_TRACE_LEVEL_DISABLED); + lparams.trace_level_ = static_cast( + lparams.trace_level_ | parsed_level_config); + } + if (setting == "mode") { + lparams.trace_mode_ = ParseTraceModeOption(value); + } + if (setting == "count") { + if (trace_count_present) { + std::cerr << "Warning: Overriding deprecated '--trace-count' " + "in favor of provided count in --trace-config!" + << std::endl; + } + lparams.trace_count_ = ParseOption(value); + } + } + catch (const ParseException& pe) { + std::stringstream ss; + ss << "Bad option: \"--trace-config " << setting << "\".\n" + << pe.what() << std::endl; + throw ParseException(ss.str()); + } + } +} + +void +TritonParser::SetTritonTraceArgs( + TritonServerParameters& lparams, bool trace_filepath_present, + bool trace_log_frequency_present) +{ + for (const auto& [setting, value_variant] : + lparams.trace_config_map_[std::to_string(TRACE_MODE_TRITON)]) { + auto value = std::get(value_variant); + try { + if (setting == "file") { + if (trace_filepath_present) { + std::cerr << "Warning: Overriding deprecated '--trace-file' " + "in favor of provided file in --trace-config!" + << std::endl; + } + lparams.trace_filepath_ = value; + } else if (setting == "log-frequency") { + if (trace_log_frequency_present) { + std::cerr << "Warning: Overriding deprecated '--trace-log-frequency' " + "in favor of provided log-frequency in --trace-config!" 
+ << std::endl; + } + lparams.trace_log_frequency_ = ParseOption(value); + } + } + catch (const ParseException& pe) { + std::stringstream ss; + ss << "Bad option: \"--trace-config triton," << setting << "\".\n" + << pe.what() << std::endl; + throw ParseException(ss.str()); + } + } +} + +void +TritonParser::SetOpenTelemetryTraceArgs( + TritonServerParameters& lparams, bool trace_filepath_present, + bool trace_log_frequency_present) +{ + if (trace_filepath_present) { + std::cerr << "Warning: '--trace-file' is deprecated and will " + "be ignored with opentelemetry tracing mode. " + << std::endl; + } + if (trace_log_frequency_present) { + std::cerr << "Warning: '--trace-log-frequency' is deprecated " + "and will be ignored with opentelemetry tracing mode." + << std::endl; + } + triton::server::TraceConfig& otel_trace_settings = + lparams.trace_config_map_[std::to_string(TRACE_MODE_OPENTELEMETRY)]; + ProcessOpenTelemetryBatchSpanProcessorArgs(otel_trace_settings); +} + +void +TritonParser::ProcessOpenTelemetryBatchSpanProcessorArgs( + TraceConfig& otel_trace_settings) +{ + std::unordered_map otel_bsp_default_settings = {}; + // Set up default BatchSpanProcessor parameters, or use + // parameters, specified by environment variables + auto env_bsp_max_queue_size = triton::server::GetEnvironmentVariableOrDefault( + "OTEL_BSP_MAX_QUEUE_SIZE", "2048"); + otel_bsp_default_settings.insert(std::make_pair( + std::string("bsp_max_queue_size"), env_bsp_max_queue_size)); + auto env_bsp_schedule_delay = triton::server::GetEnvironmentVariableOrDefault( + "OTEL_BSP_SCHEDULE_DELAY", "5000"); + otel_bsp_default_settings.insert(std::make_pair( + std::string("bsp_schedule_delay"), env_bsp_schedule_delay)); + auto env_bsp_max_export_batch_size = + triton::server::GetEnvironmentVariableOrDefault( + "OTEL_BSP_MAX_EXPORT_BATCH_SIZE", "512"); + otel_bsp_default_settings.insert(std::make_pair( + std::string("bsp_max_export_batch_size"), env_bsp_max_export_batch_size)); + + // Process cmd args and convert string arguments to integers. + // Throw a ParseException for invalid arguments + for (auto& [setting, value_variant] : otel_trace_settings) { + try { + auto value = std::get(value_variant); + if (setting == "bsp_max_queue_size") { + value_variant = ParseOption(value); + otel_bsp_default_settings.erase("bsp_max_queue_size"); + } else if (setting == "bsp_schedule_delay") { + value_variant = ParseOption(value); + otel_bsp_default_settings.erase("bsp_schedule_delay"); + } else if (setting == "bsp_max_export_batch_size") { + value_variant = ParseOption(value); + otel_bsp_default_settings.erase("bsp_max_export_batch_size"); + } + } + catch (const ParseException& pe) { + std::stringstream ss; + ss << "Bad option: \"--trace-config opentelemetry," << setting << "\".\n" + << pe.what() << std::endl; + throw ParseException(ss.str()); + } + } + // If not all BSP settings were provided through cmd, + // populate OpenTelemetry's trace settings with the default value. 
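+  // For example (illustrative): with no OTEL_BSP_* environment variables set
+  // and no --trace-config opentelemetry,bsp_* overrides on the command line,
+  // the loop above falls through and the defaults populated earlier are kept:
+  //   bsp_max_queue_size=2048, bsp_schedule_delay=5000,
+  //   bsp_max_export_batch_size=512.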
+ if (!otel_bsp_default_settings.empty()) { + for (const auto& [setting, value] : otel_bsp_default_settings) { + try { + otel_trace_settings.push_back( + std::make_pair(setting, ParseOption(value))); + } + catch (const ParseException& pe) { + std::stringstream ss; + ss << "Bad option: \"OTEL_"; + for (auto& ch : setting) { + ss << static_cast(std::toupper(ch)); + } + ss << "\".\n" << pe.what() << std::endl; + throw ParseException(ss.str()); + } + } + } +} + +void +TritonParser::PostProcessTraceArgs( + TritonServerParameters& lparams, bool trace_level_present, + bool trace_rate_present, bool trace_count_present, + bool trace_filepath_present, bool trace_log_frequency_present, + bool explicit_disable_trace) +{ + SetGlobalTraceArgs( + lparams, trace_level_present, trace_rate_present, trace_count_present, + explicit_disable_trace); + + if (lparams.trace_mode_ == TRACE_MODE_OPENTELEMETRY) { + SetOpenTelemetryTraceArgs( + lparams, trace_filepath_present, trace_log_frequency_present); + } else if (lparams.trace_mode_ == TRACE_MODE_TRITON) { + SetTritonTraceArgs( + lparams, trace_filepath_present, trace_log_frequency_present); + } + + if (explicit_disable_trace) { + lparams.trace_level_ = TRITONSERVER_TRACE_LEVEL_DISABLED; + } +} + +#endif // TRITON_ENABLE_TRACING +}} // namespace triton::server diff --git a/src/command_line_parser.h b/src/command_line_parser.h new file mode 100644 index 0000000000..762ee87b6d --- /dev/null +++ b/src/command_line_parser.h @@ -0,0 +1,353 @@ +// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "restricted_features.h" +#include "triton/common/logging.h" +#include "triton/core/tritonserver.h" +#ifdef TRITON_ENABLE_GRPC +// To avoid ambiguous reference during build +// grpc headers should be imported first +// https://github.com/open-telemetry/opentelemetry-cpp/blob/main/examples/otlp/README.md#additional-notes-regarding-abseil-library +#include "grpc/grpc_server.h" +#endif // TRITON_ENABLE_GRPC +#if defined(TRITON_ENABLE_HTTP) || defined(TRITON_ENABLE_METRICS) +#include "http_server.h" +#endif // TRITON_ENABLE_HTTP || TRITON_ENABLE_METRICS +#ifdef TRITON_ENABLE_SAGEMAKER +#include "sagemaker_server.h" +#endif // TRITON_ENABLE_SAGEMAKER +#ifdef TRITON_ENABLE_VERTEX_AI +#include "vertex_ai_server.h" +#endif // TRITON_ENABLE_VERTEX_AI + +#ifndef _WIN32 +#include +#include +#else +// Minimum implementation of for Windows +#define required_argument 1 +#define no_argument 2 +struct option { + option(const char* name, int has_arg, int* flag, int val) + : name(name), has_arg(has_arg), flag(flag), val(val) + { + } + const char* name; + int has_arg; + int* flag; + int val; +}; +#endif +#ifdef TRITON_ENABLE_TRACING +#include "tracer.h" +#endif + + +namespace triton { namespace server { + +// Command-line options +struct Option { + static constexpr const char* ArgNone = ""; + static constexpr const char* ArgBool = "boolean"; + static constexpr const char* ArgFloat = "float"; + static constexpr const char* ArgInt = "integer"; + static constexpr const char* ArgStr = "string"; + + Option(int id, std::string flag, std::string arg_desc, std::string desc) + : id_(id), flag_(flag), arg_desc_(arg_desc), desc_(desc) + { + } + + struct option GetLongOption() const + { + struct option lo { + flag_.c_str(), (!arg_desc_.empty()) ? required_argument : no_argument, + nullptr, id_ + }; + return lo; + } + + const int id_; + const std::string flag_; + const std::string arg_desc_; + const std::string desc_; +}; + +struct TritonServerParameters { + std::string server_id_{"triton"}; + bool exit_on_error_{true}; + bool strict_model_config_{false}; + bool strict_readiness_{true}; + int32_t exit_timeout_secs_{30}; +#ifdef TRITON_ENABLE_GPU + double min_supported_compute_capability_{TRITON_MIN_COMPUTE_CAPABILITY}; +#else + double min_supported_compute_capability_{0.0}; +#endif // TRITON_ENABLE_GPU + std::string repoagent_dir_{"/opt/tritonserver/repoagents"}; + std::string backend_dir_{"/opt/tritonserver/backends"}; + std::vector> + backend_config_settings_; + + // Model repository manager configuration + bool enable_model_namespacing_{false}; + bool enable_peer_access_{true}; + std::set model_repository_paths_{}; + TRITONSERVER_ModelControlMode control_mode_{TRITONSERVER_MODEL_CONTROL_NONE}; + std::set startup_models_{}; + // Interval, in seconds, when the model repository is polled for changes. + int32_t repository_poll_secs_{15}; + // Number of threads to use for concurrently loading models + uint32_t model_load_thread_count_{4}; + uint32_t model_load_retry_count_{0}; + std::map load_gpu_limit_; + // Custom model configuration file. Fall back to default config.pbtxt if not + // set. + std::string model_config_name_; + + // Rate limiter configuration + // FIXME: Once the rate limiter implementation is complete make + // EXEC_COUNT the default. 
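+  // Illustrative CLI usage once enabled (resource name "R1" is hypothetical):
+  //   --rate-limit=execution_count --rate-limit-resource=R1:10:0
+  // ParseRateLimiterResourceOption() turns "R1:10:0" into the tuple
+  // ("R1", 10, 0), i.e. resource name, count, and device id.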
+ // TRITONSERVER_RateLimitMode + // rate_limit_mode_{TRITONSERVER_RATE_LIMIT_EXEC_COUNT}; + TRITONSERVER_RateLimitMode rate_limit_mode_{TRITONSERVER_RATE_LIMIT_OFF}; + std::vector> rate_limit_resources_; + + // memory pool configuration + int64_t pinned_memory_pool_byte_size_{1 << 28}; + std::list> cuda_pools_; + std::list> cuda_virtual_address_size_; + + // [FIXME] this option is broken after backend separation: this should have + // controlled backend copy behavior but not properly propagate to backend + // after separation, need to go through backend config. + int32_t buffer_manager_thread_count_{0}; + + std::vector> host_policies_; + + // Cache configuration + bool enable_cache_{false}; + std::string cache_dir_{"/opt/tritonserver/caches"}; + std::unordered_map< + std::string, std::vector>> + cache_config_settings_; + +#ifdef TRITON_ENABLE_LOGGING + bool log_info_{true}; + bool log_warn_{true}; + bool log_error_{true}; + int32_t log_verbose_{0}; + triton::common::Logger::Format log_format_{ + triton::common::Logger::Format::kDEFAULT}; + std::string log_file_{}; +#endif // TRITON_ENABLE_LOGGING + +#ifdef TRITON_ENABLE_TRACING + std::string trace_filepath_{}; + TRITONSERVER_InferenceTraceLevel trace_level_{ + TRITONSERVER_TRACE_LEVEL_DISABLED}; + int32_t trace_rate_{1000}; + int32_t trace_count_{-1}; + int32_t trace_log_frequency_{0}; + InferenceTraceMode trace_mode_{TRACE_MODE_TRITON}; + TraceConfigMap trace_config_map_; +#endif // TRITON_ENABLE_TRACING + +// The configurations for various endpoints (i.e. HTTP, GRPC and metrics) +#ifdef TRITON_ENABLE_HTTP + bool allow_http_{true}; + std::string http_address_{"0.0.0.0"}; + int32_t http_port_{8000}; + bool reuse_http_port_{false}; + std::string http_forward_header_pattern_; + // The number of threads to initialize for the HTTP front-end. + int http_thread_cnt_{8}; + RestrictedFeatures http_restricted_apis_{}; +#endif // TRITON_ENABLE_HTTP + +#ifdef TRITON_ENABLE_GRPC + bool allow_grpc_{true}; + triton::server::grpc::Options grpc_options_; +#endif // TRITON_ENABLE_GRPC + +#ifdef TRITON_ENABLE_METRICS + bool allow_metrics_{true}; + // Defaults to http_address_ if TRITON_ENABLE_HTTP is enabled for backwards, + // otherwise defaults to "0.0.0.0" for TRITON_ENABLE_HTTP is disabled. + std::string metrics_address_{""}; + int32_t metrics_port_{8002}; + // Metric settings for Triton core + float metrics_interval_ms_{2000}; + bool allow_gpu_metrics_{true}; + bool allow_cpu_metrics_{true}; + std::vector> + metrics_config_settings_; +#endif // TRITON_ENABLE_METRICS + +#ifdef TRITON_ENABLE_SAGEMAKER + bool allow_sagemaker_{false}; + std::string sagemaker_address_{"0.0.0.0"}; + int32_t sagemaker_port_{8080}; + bool sagemaker_safe_range_set_{false}; + std::pair sagemaker_safe_range_{-1, -1}; + // The number of threads to initialize for the SageMaker HTTP front-end. + int sagemaker_thread_cnt_{8}; +#endif // TRITON_ENABLE_SAGEMAKER + +#ifdef TRITON_ENABLE_VERTEX_AI + bool allow_vertex_ai_{false}; + std::string vertex_ai_address_{"0.0.0.0"}; + int32_t vertex_ai_port_{8080}; + // The number of threads to initialize for the Vertex AI HTTP front-end. + int vertex_ai_thread_cnt_{8}; + std::string vertex_ai_default_model_{}; +#endif // TRITON_ENABLE_VERTEX_AI + + // [FIXME] who should call this function? 
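+  // A minimal usage sketch (hypothetical caller, not part of this patch):
+  //   TritonParser parser;
+  //   auto [params, unrecognized] = parser.Parse(argc, argv);  // may throw ParseException
+  //   params.CheckPortCollision();
+  //   auto server_options = params.BuildTritonServerOptions();
+  // i.e. collision checking is presumably expected to happen after Parse()
+  // and before the TRITONSERVER_ServerOptions object is built.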
+  void CheckPortCollision();
+  using ManagedTritonServerOptionPtr = std::unique_ptr<
+      TRITONSERVER_ServerOptions, decltype(&TRITONSERVER_ServerOptionsDelete)>;
+  ManagedTritonServerOptionPtr BuildTritonServerOptions();
+};
+
+// Exception type to be thrown if the error is parsing related
+class ParseException : public std::exception {
+ public:
+  ParseException() = default;
+  ParseException(const std::string& message) : message_(message) {}
+
+  virtual const char* what() const throw() { return message_.c_str(); }
+
+ private:
+  const std::string message_{""};
+};
+
+// [WIP] Fall-through parser: Parse() converts the recognized options into a
+// parameter object and returns the unrecognized options as another argument
+// list for other parsers to consume.
+// This allows the composition of a parser chain.
+// [FIXME] abstract interface, concrete class below should only parse Triton
+// core and endpoint control options (endpoint specific options in their own
+// parser)
+class TritonParser {
+ public:
+  TritonParser();
+  // Parse command line arguments into a parameters struct and transform
+  // the argument list to contain only unrecognized options. The content of
+  // the unrecognized argument list shares the same lifecycle as 'argv'.
+  // Raises ParseException if it fails to parse recognized options.
+  std::pair<TritonServerParameters, std::vector<char*>> Parse(
+      int argc, char** argv);
+
+  // Return usage of all recognized options
+  std::string Usage();
+
+ private:
+  std::string FormatUsageMessage(std::string str, int offset);
+  // Helper functions for parsing options that require multi-value parsing.
+  std::tuple<std::string, std::string, std::string> ParseCacheConfigOption(
+      const std::string& arg);
+  std::tuple<std::string, int, int> ParseRateLimiterResourceOption(
+      const std::string& arg);
+  std::tuple<std::string, std::string, std::string> ParseBackendConfigOption(
+      const std::string& arg);
+  std::tuple<std::string, std::string, std::string> ParseHostPolicyOption(
+      const std::string& arg);
+  std::tuple<std::string, std::string, std::string> ParseMetricsConfigOption(
+      const std::string& arg);
+  void ParseRestrictedFeatureOption(
+      const std::string& arg, const std::string& option_name,
+      const std::string& header_prefix, const std::string& feature_type,
+      RestrictedFeatures& restricted_features);
+#ifdef TRITON_ENABLE_TRACING
+  TRITONSERVER_InferenceTraceLevel ParseTraceLevelOption(std::string arg);
+  InferenceTraceMode ParseTraceModeOption(std::string arg);
+  std::tuple<std::string, std::string, std::string> ParseTraceConfigOption(
+      const std::string& arg);
+  // Helper functions for post processing for collected trace arguments.
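+  // For example (illustrative values), the deprecated flags and the
+  // --trace-config equivalents that these helpers reconcile:
+  //   --trace-rate=100          ->  --trace-config rate=100
+  //   --trace-level=TIMESTAMPS  ->  --trace-config level=TIMESTAMPS
+  //   --trace-file=trace.json   ->  --trace-config triton,file=trace.json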
+  void SetGlobalTraceArgs(
+      TritonServerParameters& lparams, bool trace_level_present,
+      bool trace_rate_present, bool trace_count_present,
+      bool explicit_disable_trace);
+  void SetTritonTraceArgs(
+      TritonServerParameters& lparams, bool trace_filepath_present,
+      bool trace_log_frequency_present);
+  void SetOpenTelemetryTraceArgs(
+      TritonServerParameters& lparams, bool trace_filepath_present,
+      bool trace_log_frequency_present);
+  void PostProcessTraceArgs(
+      TritonServerParameters& lparams, bool trace_level_present,
+      bool trace_rate_present, bool trace_count_present,
+      bool trace_filepath_present, bool trace_log_frequency_present,
+      bool explicit_disable_trace);
+  void ProcessOpenTelemetryBatchSpanProcessorArgs(
+      TraceConfig& otel_trace_settings);
+#endif  // TRITON_ENABLE_TRACING
+  // Helper function to parse options in
+  // "<config_name>[1st_delim]<setting>[2nd_delim]<value>" format
+  std::tuple<std::string, std::string, std::string> ParseGenericConfigOption(
+      const std::string& arg, const std::string& first_delim,
+      const std::string& second_delim, const std::string& option_name,
+      const std::string& config_name);
+
+  // Initialize individual option groups
+  void SetupOptions();
+  // Initialize option group mappings
+  void SetupOptionGroups();
+
+  // Sum of option groups: vector to maintain insertion order for Usage()
+  std::vector<std::pair<std::string, std::vector<Option>&>> option_groups_;
+  // Individual option groups
+  std::vector