
Commit 557af46
…into yinggeh-DLIS-6657-client-input-byte-size-check
yinggeh committed Jul 27, 2024
2 parents 73d374e + 9ed1544 commit 557af46
Showing 5 changed files with 99 additions and 27 deletions.
python/test/test_api.py: 7 changes (6 additions, 1 deletion)
@@ -70,7 +70,7 @@
exit_on_error=True,
strict_model_config=False,
model_control_mode=tritonserver.ModelControlMode.EXPLICIT,
exit_timeout=10,
exit_timeout=30,
)


@@ -357,6 +357,11 @@ def test_stop(self):
{
"backend": "python",
"parameters": {"decoupled": {"string_value": "False"}},
# Keep instance count low for fast startup/cleanup.
# Alternatively can use KIND_CPU here, but keeping gpus/count explicit.
"instance_group": [
{"kind": "KIND_GPU", "gpus": [0], "count": 1}
],
}
)
},
src/infer_request.cc: 62 changes (41 additions, 21 deletions)
@@ -1016,6 +1016,17 @@ InferenceRequest::Normalize()
for (auto& pr : original_inputs_) {
auto& input = pr.second;
*input.MutableShape() = input.OriginalShape();

const inference::ModelInput* input_config;
RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
if (input_config->is_shape_tensor()) {
// For a shape tensor, mark that the input is a shape tensor.
input.SetIsShapeTensor();
} else if (input_config->is_non_linear_format_io()) {
// If a tensor uses a non-linear IO format, indicate that the input uses
// a non-linear IO format.
input.SetIsNonLinearFormatIo();
}
}
} else {
// Model does support Triton-style batching so each input tensor
@@ -1025,15 +1036,19 @@ InferenceRequest::Normalize()
batch_size_ = 0;
for (auto& pr : original_inputs_) {
auto& input = pr.second;
const inference::ModelInput* input_config;
RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));

// For a shape tensor, keep the tensor's shape as it is and mark
// that the input is a shape tensor.
const inference::ModelInput* input_config;
RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
if (input_config->is_shape_tensor()) {
*input.MutableShape() = input.OriginalShape();
input.SetIsShapeTensor(true);
input.SetIsShapeTensor();
continue;
} else if (input_config->is_non_linear_format_io()) {
// If a tensor uses a non-linear IO format, indicate that the input uses
// a non-linear IO format.
input.SetIsNonLinearFormatIo();
}

if (input.OriginalShape().size() == 0) {
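Both hunks above apply the same classification: each input is tagged once, from its model-config entry, with is_shape_tensor taking precedence over is_non_linear_format_io. A condensed, self-contained sketch of that dispatch (the struct below stands in for the generated inference::ModelInput proto; the names mirror the diff):

```cpp
#include <iostream>

// Stand-in for the two inference::ModelInput fields consulted above;
// the real type is generated from model_config.proto.
struct ModelInputConfig {
  bool is_shape_tensor = false;
  bool is_non_linear_format_io = false;
};

// Mirrors InferenceRequest::Input::TensorType from this commit.
enum class TensorType { TENSOR, SHAPE_TENSOR, NON_LINEAR };

// Shape tensors win, then non-linear IO format, otherwise an
// ordinary linear-layout tensor (the default).
TensorType Classify(const ModelInputConfig& cfg) {
  if (cfg.is_shape_tensor) return TensorType::SHAPE_TENSOR;
  if (cfg.is_non_linear_format_io) return TensorType::NON_LINEAR;
  return TensorType::TENSOR;
}

int main() {
  ModelInputConfig cfg;
  cfg.is_non_linear_format_io = true;
  std::cout << (Classify(cfg) == TensorType::NON_LINEAR) << "\n";  // 1
}
```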
@@ -1183,15 +1198,9 @@ InferenceRequest::Normalize()
{
const auto& data_type = input.DType();

// FIXME: Skip byte size validation for TensorRT backend because it breaks
// shape-size assumption. See DLIS-6805 for proper fix for TRT backend
// reformat_free tensors.
bool skip_byte_size_check = false;
constexpr char trt_prefix[] = "tensorrt_";
const std::string& platform = model_raw_->Config().platform();
skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);

if (!skip_byte_size_check) {
// Non-linear IO format input byte size validation will be handled in the
// TensorRT backend.
if (!input.IsNonLinearFormatIo()) {
TRITONSERVER_MemoryType input_memory_type;
// Because Triton expects STRING type to be in special format
// (prepend 4 bytes to specify string length), so need to add all the
Expand All @@ -1201,10 +1210,13 @@ InferenceRequest::Normalize()
input_name, input, model_name, &input_memory_type));
// FIXME: Temporarily skips byte size checks for GPU tensors. See
// DLIS-6820.
skip_byte_size_check |=
(input_memory_type == TRITONSERVER_MEMORY_GPU);
} else {
const auto& input_dims = input.ShapeWithBatchDim();
// Shape tensor with dynamic batching does not introduce a new
// dimension to the tensor but adds an additional value to the 1-D
// array.
const std::vector<int64_t>& input_dims =
input.IsShapeTensor() ? input.OriginalShape()
: input.ShapeWithBatchDim();
int64_t expected_byte_size = INT_MAX;
expected_byte_size =
triton::common::GetByteSize(data_type, input_dims);
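The shape-tensor branch is worth a worked example. For an ordinary input, batching prepends a dimension, so an input of shape [3] at batch size 4 is validated as [4, 3]; for a TensorRT shape tensor, batching folds into the existing 1-D value array instead of adding a dimension, so the shape as received from the client (OriginalShape()) is the correct basis for the size check. A self-contained sketch of the arithmetic, with GetByteSize as a stand-in for triton::common::GetByteSize and assumed to reduce to element size times element count for fixed-size types:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Stand-in for triton::common::GetByteSize for a fixed-size datatype:
// element size times the product of the dims.
int64_t GetByteSize(int64_t elem_size, const std::vector<int64_t>& dims) {
  return std::accumulate(
      dims.begin(), dims.end(), elem_size, std::multiplies<int64_t>());
}

int main() {
  const int64_t kInt32 = 4;  // bytes per INT32 element

  // Ordinary input: shape [3] under batch size 4 becomes [4, 3],
  // so ShapeWithBatchDim() drives the expected byte size.
  std::cout << GetByteSize(kInt32, {4, 3}) << "\n";  // 48

  // Shape tensor: batching extends the 1-D value array rather than
  // adding a dimension, so OriginalShape() is used as received.
  std::cout << GetByteSize(kInt32, {3}) << "\n";  // 12
}
```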
@@ -1524,7 +1536,7 @@ InferenceRequest::ReportStatisticsCacheHit(MetricModelReporter* metric_reporter)
// Input
//
InferenceRequest::Input::Input()
: is_shape_tensor_(false), data_(new MemoryReference),
: tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}
@@ -1533,16 +1545,17 @@ InferenceRequest::Input::Input(
const std::string& name, const inference::DataType datatype,
const int64_t* shape, const uint64_t dim_count)
: name_(name), datatype_(datatype),
original_shape_(shape, shape + dim_count), is_shape_tensor_(false),
data_(new MemoryReference), has_host_policy_specific_data_(false)
original_shape_(shape, shape + dim_count),
tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}

InferenceRequest::Input::Input(
const std::string& name, const inference::DataType datatype,
const std::vector<int64_t>& shape)
: name_(name), datatype_(datatype), original_shape_(shape),
is_shape_tensor_(false), data_(new MemoryReference),
tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}
@@ -1558,9 +1571,16 @@ InferenceRequest::Input::SetMetadata(
}

Status
InferenceRequest::Input::SetIsShapeTensor(const bool is_shape_tensor)
InferenceRequest::Input::SetIsShapeTensor()
{
tensor_type_ = TensorType::SHAPE_TENSOR;
return Status::Success;
}

Status
InferenceRequest::Input::SetIsNonLinearFormatIo()
{
is_shape_tensor_ = is_shape_tensor;
tensor_type_ = TensorType::NON_LINEAR;
return Status::Success;
}

src/infer_request.h: 20 changes (17 additions, 3 deletions)
@@ -82,6 +82,8 @@ class InferenceRequest {
// Input tensor
class Input {
public:
enum class TensorType { TENSOR, SHAPE_TENSOR, NON_LINEAR };

Input();
Input(
const std::string& name, const inference::DataType datatype,
@@ -134,10 +136,22 @@
}

// Whether or not the input is a tensorrt shape tensor
bool IsShapeTensor() const { return is_shape_tensor_; }
bool IsShapeTensor() const
{
return tensor_type_ == TensorType::SHAPE_TENSOR;
}

// Specifies whether the input uses a non-linear IO format
bool IsNonLinearFormatIo() const
{
return tensor_type_ == TensorType::NON_LINEAR;
}

// Set the input to be treated as a shape tensor.
Status SetIsShapeTensor(const bool is_shape_tensor);
Status SetIsShapeTensor();

// Mark that the input uses a non-linear IO format
Status SetIsNonLinearFormatIo();

// The data for this input.
const std::shared_ptr<Memory>& Data() const { return data_; }
Expand Down Expand Up @@ -240,7 +254,7 @@ class InferenceRequest {
std::vector<int64_t> original_shape_;
std::vector<int64_t> shape_;
std::vector<int64_t> shape_with_batch_dim_;
bool is_shape_tensor_;
TensorType tensor_type_;
std::shared_ptr<Memory> data_;

bool has_host_policy_specific_data_;
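A minimal model of the reworked accessors, reduced to just the tensor-type state (the real Input also carries name, datatype, shapes, and data, as above). Replacing the old is_shape_tensor_ boolean with one enum makes the three states mutually exclusive by construction:

```cpp
#include <cassert>

// Reduced model of InferenceRequest::Input's tensor-type state.
class Input {
 public:
  enum class TensorType { TENSOR, SHAPE_TENSOR, NON_LINEAR };

  bool IsShapeTensor() const {
    return tensor_type_ == TensorType::SHAPE_TENSOR;
  }
  bool IsNonLinearFormatIo() const {
    return tensor_type_ == TensorType::NON_LINEAR;
  }

  void SetIsShapeTensor() { tensor_type_ = TensorType::SHAPE_TENSOR; }
  void SetIsNonLinearFormatIo() { tensor_type_ = TensorType::NON_LINEAR; }

 private:
  TensorType tensor_type_ = TensorType::TENSOR;  // default, as in the ctors
};

int main() {
  Input in;
  assert(!in.IsShapeTensor() && !in.IsNonLinearFormatIo());
  in.SetIsNonLinearFormatIo();
  // With two independent booleans, "shape tensor AND non-linear" was
  // representable; a single enum rules that state out.
  assert(in.IsNonLinearFormatIo() && !in.IsShapeTensor());
}
```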
src/model_config_utils.cc: 32 changes (32 additions, 0 deletions)
@@ -418,6 +418,34 @@ ValidateIOShape(
return Status::Success;
}

/// Validate that Non-linear format inputs or outputs are specified correctly
/// in a model configuration.
template <class ModelIO>
Status
ValidateNonLinearFormatIO(
const ModelIO& io, const std::string& platform, bool is_input)
{
if (!io.is_non_linear_format_io()) {
// Nothing to validate as the tensor is not non-linear format.
return Status::Success;
}

if (platform != kTensorRTPlanPlatform) {
return Status(
Status::Code::INVALID_ARG,
"Non-linear IO format is only supported for the TensorRT platform");
}

if (io.dims_size() != 3) {
std::string io_type = is_input ? "input" : "output";
return Status(
Status::Code::INVALID_ARG,
"Non-linear IO format " + io_type + " requires 3 dims");
}

return Status::Success;
}

} // namespace

Status
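A self-contained restatement of the new validator for illustration, with Status reduced to a string, the config proto reduced to the two fields the function reads, and kTensorRTPlanPlatform assumed to equal "tensorrt_plan":

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ModelIO {
  bool is_non_linear_format_io = false;
  std::vector<int64_t> dims;
};

constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";  // assumed value

// Same two rules as ValidateNonLinearFormatIO above.
std::string Validate(const ModelIO& io, const std::string& platform,
                     bool is_input) {
  if (!io.is_non_linear_format_io) return "OK";
  if (platform != kTensorRTPlanPlatform)
    return "INVALID_ARG: non-linear IO format is TensorRT-only";
  if (io.dims.size() != 3)
    return std::string("INVALID_ARG: non-linear IO format ") +
           (is_input ? "input" : "output") + " requires 3 dims";
  return "OK";
}

int main() {
  ModelIO io{true, {3, 224}};
  std::cout << Validate(io, "onnxruntime_onnx", true) << "\n";     // platform
  std::cout << Validate(io, kTensorRTPlanPlatform, true) << "\n";  // dims
  io.dims.push_back(224);                                          // {3,224,224}
  std::cout << Validate(io, kTensorRTPlanPlatform, true) << "\n";  // OK
}
```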
@@ -1732,6 +1760,8 @@ ValidateModelInput(
"shape tensors are only supported for TensorRT platform");
}

RETURN_IF_ERROR(ValidateNonLinearFormatIO(io, platform, true /* is_input*/));

return Status::Success;
}

@@ -1768,6 +1798,8 @@ ValidateModelOutput(
"shape tensors are only supported for TensorRT platform");
}

RETURN_IF_ERROR(ValidateNonLinearFormatIO(io, platform, false /* is_input*/));

return Status::Success;
}

src/test/response_cache_test.cc: 5 changes (3 additions, 2 deletions)
@@ -70,8 +70,9 @@ InferenceRequest::Input::Input(
const std::string& name, const inference::DataType datatype,
const int64_t* shape, const uint64_t dim_count)
: name_(name), datatype_(datatype),
original_shape_(shape, shape + dim_count), is_shape_tensor_(false),
data_(new MemoryReference), has_host_policy_specific_data_(false)
original_shape_(shape, shape + dim_count),
tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}

