From 83f78f36b30fd64150a67f7ecabad166f1fa47af Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Sat, 11 Jan 2025 11:34:47 -0800
Subject: [PATCH] Some optimizations and fixes

* Fix py_future object lifecycle

* Fix request released after complete final
---
 python/tritonserver/_api/_model.py            |  2 --
 python/tritonserver/_api/_response.py         |  8 +----
 python/tritonserver/_c/tritonserver_pybind.cc | 36 +++++++++++--------
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/python/tritonserver/_api/_model.py b/python/tritonserver/_api/_model.py
index b016de8ab..7ccf98e58 100644
--- a/python/tritonserver/_api/_model.py
+++ b/python/tritonserver/_api/_model.py
@@ -211,7 +211,6 @@ async def __anext__(self):
 
         response = InferenceResponse._from_tritonserver_inference_response(
             self._model,
-            self._request,
             response,
             flags,
             self._inference_request.output_memory_type,
@@ -319,7 +318,6 @@ def __next__(self):
 
         response = InferenceResponse._from_tritonserver_inference_response(
             self._model,
-            self._request,
             response,
             flags,
             self._inference_request.output_memory_type,
diff --git a/python/tritonserver/_api/_response.py b/python/tritonserver/_api/_response.py
index 3c7a9d58f..b488c2bda 100644
--- a/python/tritonserver/_api/_response.py
+++ b/python/tritonserver/_api/_response.py
@@ -44,11 +44,7 @@
 from tritonserver._api._dlpack import DLDeviceType as DLDeviceType
 from tritonserver._api._logging import LogMessage
 from tritonserver._api._tensor import Tensor
-from tritonserver._c.triton_bindings import (
-    InternalError,
-    TritonError,
-    TRITONSERVER_InferenceRequest,
-)
+from tritonserver._c.triton_bindings import InternalError, TritonError
 from tritonserver._c.triton_bindings import TRITONSERVER_LogLevel as LogLevel
 from tritonserver._c.triton_bindings import TRITONSERVER_MemoryType as MemoryType
 from tritonserver._c.triton_bindings import (
@@ -103,14 +99,12 @@ class InferenceResponse:
     @staticmethod
     def _from_tritonserver_inference_response(
         model: _model.Model,
-        request: TRITONSERVER_InferenceRequest,
         response,
         flags: TRITONSERVER_ResponseCompleteFlag,
         output_memory_type: Optional[DeviceOrMemoryType] = None,
     ):
         result = InferenceResponse(
             model,
-            request.id,
             final=(flags == TRITONSERVER_ResponseCompleteFlag.FINAL),
         )
 
diff --git a/python/tritonserver/_c/tritonserver_pybind.cc b/python/tritonserver/_c/tritonserver_pybind.cc
index b751bb6d7..425efc2d8 100644
--- a/python/tritonserver/_c/tritonserver_pybind.cc
+++ b/python/tritonserver/_c/tritonserver_pybind.cc
@@ -893,6 +893,8 @@ class PyInferenceRequest
     }
   }
 
+  void Release() { owned_ = false; }
+
   void SetReleaseCallback()
   {
     ThrowIfError(TRITONSERVER_InferenceRequestSetReleaseCallback(
@@ -902,10 +904,9 @@ class PyInferenceRequest
       struct TRITONSERVER_InferenceRequest* request, const uint32_t flags,
       void* userp)
   {
-    // This wrapper object will be kept alive for the entire inference process,
-    // so this wrapper will not attempt to destruct the Triton request object
-    // during inference. Thus, it is ok for now to not update the 'owned' flag
-    // after passing the Triton request to the core and before it is released.
+    // This function may be called after the request wrapper object is
+    // deallocated, so the wrapper object must not be accessed here.
+    ThrowIfError(TRITONSERVER_InferenceRequestDelete(request));
   }
 
   void SetResponseAllocator()
@@ -1159,7 +1160,7 @@ class PyInferenceRequest
   }
 
   // Response management
-  void GetNextResponse(py::object& py_future)
+  void GetNextResponse(const py::object& py_future)
   {
     std::lock_guard<std::mutex> lock(response_mu_);
 
@@ -1167,8 +1168,7 @@ class PyInferenceRequest
       if (response_future_.get() != nullptr) {
        throw AlreadyExistsError("cannot call GetNextResponse concurrently");
      }
-      response_future_.reset(new py::object());
-      *response_future_ = py_future;
+      response_future_.reset(new py::object(py_future));
     } else {
       std::pair<std::shared_ptr<PyInferenceResponse>, const uint32_t>&
          py_response = responses_.front();
@@ -1186,7 +1186,7 @@ class PyInferenceRequest
       managed_ptr.reset(new PyInferenceResponse(response, true /* owned */));
     }
     std::pair<std::shared_ptr<PyInferenceResponse>, const uint32_t> py_response(
-        managed_ptr, flags);
+        std::move(managed_ptr), std::move(flags));
     {
       std::lock_guard<std::mutex> lock(response_mu_);
       if (response_future_.get() == nullptr) {
@@ -1196,13 +1196,14 @@ class PyInferenceRequest
     }
     {
       py::gil_scoped_acquire gil;
-      PyFutureSetResult(*response_future_, py_response);
-      response_future_.reset(nullptr);
+      std::unique_ptr<py::object> response_future_local(nullptr);
+      response_future_.swap(response_future_local);
+      PyFutureSetResult(*response_future_local, py_response);
     }
   }
 
   void PyFutureSetResult(
-      py::object& py_future,
+      const py::object& py_future,
       std::pair<std::shared_ptr<PyInferenceResponse>, const uint32_t>&
          py_response)
   {
@@ -1634,16 +1635,25 @@ class PyServer : public PyWrapper<struct TRITONSERVER_Server> {
   void InferAsync(
       const std::shared_ptr<PyInferenceRequest>& request, PyTrace& trace)
   {
+    request->SetReleaseCallback();
+    request->SetResponseAllocator();
+    request->SetResponseCallback();
     ThrowIfError(TRITONSERVER_ServerInferAsync(
         triton_object_, request->Ptr(), trace.Ptr()));
     // Ownership of the internal C object is transferred.
+    request->Release();
     trace.Release();
   }
 
   void InferAsync(const std::shared_ptr<PyInferenceRequest>& request)
   {
+    request->SetReleaseCallback();
+    request->SetResponseAllocator();
+    request->SetResponseCallback();
     ThrowIfError(
         TRITONSERVER_ServerInferAsync(triton_object_, request->Ptr(), nullptr));
+    // Ownership of the internal C object is transferred.
+    request->Release();
   }
 
 };
@@ -1710,10 +1720,6 @@ PyInferenceRequest::PyInferenceRequest(
   ThrowIfError(TRITONSERVER_InferenceRequestNew(
       &triton_object_, server.Ptr(), model_name.c_str(), model_version));
   owned_ = true;
-
-  SetReleaseCallback();
-  SetResponseAllocator();
-  SetResponseCallback();
 }
 
 // [FIXME] module name?
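
The InferAsync hunks above appear to follow a "release after transfer" ownership pattern: the release/response callbacks are registered just before TRITONSERVER_ServerInferAsync, and once that call succeeds the wrapper clears its owned_ flag via Release() so that only the core's release callback deletes the raw request. A minimal, self-contained C++ sketch of that pattern is shown below; the Handle, Wrapper, and SubmitToCore names are illustrative stand-ins, not part of the Triton API.

    #include <iostream>

    // Stand-in for the raw TRITONSERVER_InferenceRequest handle.
    struct Handle {
      ~Handle() { std::cout << "handle deleted\n"; }
    };

    // Stand-in for the request wrapper: owns the handle until it is transferred.
    class Wrapper {
     public:
      Wrapper() : handle_(new Handle()), owned_(true) {}
      ~Wrapper()
      {
        if (owned_) {
          delete handle_;  // delete only if ownership was never transferred
        }
      }
      Handle* Ptr() const { return handle_; }
      void Release() { owned_ = false; }  // ownership handed to the core

     private:
      Handle* handle_;
      bool owned_;
    };

    // Stand-in for TRITONSERVER_ServerInferAsync plus the release callback:
    // the core takes over the handle and deletes it when the request is released.
    void SubmitToCore(Handle* handle)
    {
      delete handle;
    }

    int main()
    {
      Wrapper request;
      SubmitToCore(request.Ptr());
      // The core now owns the handle, so mark the wrapper non-owning to avoid
      // a double delete in ~Wrapper().
      request.Release();
      return 0;
    }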