diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index ad04d57f8..42154cdca 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1722,9 +1722,9 @@ TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading( /// /// \param batcher User-defined placeholder for backend to store and /// retrieve information about the batching strategy for this -/// model.RITONBACKEND_ISPEC return a TRITONSERVER_Error indicating success or -/// failure. \param model The backend model for which Triton is forming a batch. -/// \return a TRITONSERVER_Error indicating success or failure. +/// model. Returns a TRITONSERVER_Error indicating success +/// or failure. \param model The backend model for which Triton is forming a +/// batch. \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelBatcherInitialize( TRITONBACKEND_Batcher** batcher, TRITONBACKEND_Model* model); diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h index b8e25df72..ef5a45d6a 100644 --- a/include/triton/core/tritonserver.h +++ b/include/triton/core/tritonserver.h @@ -91,7 +91,7 @@ struct TRITONSERVER_MetricFamily; /// } /// #define TRITONSERVER_API_VERSION_MAJOR 1 -#define TRITONSERVER_API_VERSION_MINOR 32 +#define TRITONSERVER_API_VERSION_MINOR 33 /// Get the TRITONBACKEND API version supported by the Triton shared /// library. This value can be compared against the @@ -732,7 +732,8 @@ typedef enum tritonserver_traceactivity_enum { TRITONSERVER_TRACE_REQUEST_END = 6, TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7, TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8, - TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9 + TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9, + TRITONSERVER_TRACE_CUSTOM_ACTIVITY = 10 } TRITONSERVER_InferenceTraceActivity; /// Get the string representation of a trace activity. The returned @@ -838,6 +839,18 @@ TRITONSERVER_InferenceTraceTensorNew( TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn, TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp); +/// Report a trace activity. All the traces reported using this API will be +/// using TRITONSERVER_TRACE_CUSTOM_ACTIVITY type. +/// +/// \param trace The trace object. +/// \param timestamp The timestamp associated with the trace activity. +/// \param name The trace activity name. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_InferenceTraceReportActivity( + TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, + const char* activity_name); + /// Delete a trace object. /// /// \param trace The trace object. @@ -921,7 +934,6 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_InferenceTraceSetContext( struct TRITONSERVER_InferenceTrace* trace, const char* trace_context); - /// Get TRITONSERVER_InferenceTrace context. /// /// \param trace The trace. diff --git a/python/test/test_api.py b/python/test/test_api.py index 75ca9391e..93124c470 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -345,11 +345,6 @@ def test_ready(self): server = tritonserver.Server(self._server_options).start() self.assertTrue(server.ready()) - @pytest.mark.xfail( - tritonserver.__version__ <= "2.48.0", - reason="Known issue on stop: Exit timeout expired. 
Exiting immediately", - raises=tritonserver.InternalError, - ) def test_stop(self): server = tritonserver.Server(self._server_options).start(wait_until_ready=True) diff --git a/python/tritonserver/_c/tritonserver_pybind.cc b/python/tritonserver/_c/tritonserver_pybind.cc index 127bb15b8..6017b3d7e 100644 --- a/python/tritonserver/_c/tritonserver_pybind.cc +++ b/python/tritonserver/_c/tritonserver_pybind.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1434,7 +1434,18 @@ class PyServer : public PyWrapper { owned_ = true; } - void Stop() const { ThrowIfError(TRITONSERVER_ServerStop(triton_object_)); } + void Stop() const + { + // ServerStop is blocking for the duration of the server exit timeout, so + // ensure to release the GIL. This can allow request release callbacks + // to be interleaved while server is waiting for live requests/models + // to complete. Without releasing GIL, this function may acquire the GIL + // first and block the Triton request from being released/freed, thus + // blocking the server's shutdown in a circular manner thinking a model is + // still alive. + py::gil_scoped_release release; + ThrowIfError(TRITONSERVER_ServerStop(triton_object_)); + } void RegisterModelRepository( const std::string& repository_path, diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index c6832a3a1..1aa8a9c48 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -36,6 +36,7 @@ #include "backend_config.h" #include "backend_model.h" #include "cuda_utils.h" +#include "infer_stats.h" #include "metrics.h" #include "model_config.pb.h" #include "numa_utils.h" @@ -558,7 +559,8 @@ TritonModelInstance::PrepareRequestsOrRespond( // If any errors occurred, respond with error for each request. if (!status.IsOk()) { for (auto& r : requests) { - InferenceRequest::RespondIfError(r, status, true /* release_requests */); + InferenceRequest::RespondIfError( + r, status, true /* release_requests */, FailureReason::OTHER); } // Log a single error for batch of requests for better visibility LOG_STATUS_ERROR(status, "Requests failed pre-execution checks"); @@ -685,7 +687,16 @@ TritonModelInstance::Execute( for (TRITONBACKEND_Request* tr : triton_requests) { std::unique_ptr ur( reinterpret_cast(tr)); - InferenceRequest::RespondIfError(ur, status, true /* release_requests */); + // NOTE: If a backend both returns an error in + // TRITONBACKEND_ModelInstanceExecute and reports an error with + // TRITONBACKEND_ModelInstanceReportStatistics, this can result in double + // counting of the failure metric for the same request. However, it is + // currently not expected for this to be a common case, as the return + // value of TRITONBACKEND_ModelInstanceExecute is used to express + // ownership of the request rather than success of an inference request. + // See tritonbackend.h for more details on this. 
+ InferenceRequest::RespondIfError( + ur, status, true /* release_requests */, FailureReason::BACKEND); } TRITONSERVER_ErrorDelete(err); diff --git a/src/dynamic_batch_scheduler.cc b/src/dynamic_batch_scheduler.cc index ac7aa1276..b5f8ac825 100644 --- a/src/dynamic_batch_scheduler.cc +++ b/src/dynamic_batch_scheduler.cc @@ -50,11 +50,12 @@ IsStaleState(Payload::State payload_state) void FinishSkippedRequests( std::vector>>&& requests, - const Status& response_status) + const Status& response_status, FailureReason reason) { for (auto& queue : requests) { for (auto& request : queue) { - InferenceRequest::RespondIfError(request, response_status, true); + InferenceRequest::RespondIfError( + request, response_status, true /* release_requests */, reason); } } } @@ -69,8 +70,10 @@ FinishRejectedCancelledRequests( const static Status rejected_status = Status(Status::Code::UNAVAILABLE, "Request timeout expired"); const static Status cancelled_status = Status(Status::Code::CANCELLED); - FinishSkippedRequests(std::move(rejected_requests), rejected_status); - FinishSkippedRequests(std::move(cancelled_requests), cancelled_status); + FinishSkippedRequests( + std::move(rejected_requests), rejected_status, FailureReason::REJECTED); + FinishSkippedRequests( + std::move(cancelled_requests), cancelled_status, FailureReason::CANCELED); } DynamicBatchScheduler::DynamicBatchScheduler( diff --git a/src/ensemble_scheduler/ensemble_scheduler.cc b/src/ensemble_scheduler/ensemble_scheduler.cc index a16044062..b16567dd7 100644 --- a/src/ensemble_scheduler/ensemble_scheduler.cc +++ b/src/ensemble_scheduler/ensemble_scheduler.cc @@ -81,23 +81,26 @@ class RequestTracker { std::lock_guard lk(mtx_); inflight_request_counter_--; if (inflight_request_counter_ == 0) { + if (request_ != nullptr) { #ifdef TRITON_ENABLE_STATS - const auto& infer_stats = context_stats_aggregator_.ImmutableInferStats(); - request_->ReportStatisticsWithDuration( - metric_reporter_, status_.IsOk(), compute_start_ns_, - infer_stats.compute_input_duration_ns_, - infer_stats.compute_infer_duration_ns_, - infer_stats.compute_output_duration_ns_); - if (status_.IsOk()) { - stats_aggregator_->UpdateInferBatchStatsWithDuration( - metric_reporter_, std::max(1U, request_->BatchSize()), + const auto& infer_stats = + context_stats_aggregator_.ImmutableInferStats(); + request_->ReportStatisticsWithDuration( + metric_reporter_, status_.IsOk(), compute_start_ns_, infer_stats.compute_input_duration_ns_, infer_stats.compute_infer_duration_ns_, infer_stats.compute_output_duration_ns_); - } + if (status_.IsOk()) { + stats_aggregator_->UpdateInferBatchStatsWithDuration( + metric_reporter_, std::max(1U, request_->BatchSize()), + infer_stats.compute_input_duration_ns_, + infer_stats.compute_infer_duration_ns_, + infer_stats.compute_output_duration_ns_); + } #endif - InferenceRequest::Release( - std::move(request_), TRITONSERVER_REQUEST_RELEASE_ALL); + InferenceRequest::Release( + std::move(request_), TRITONSERVER_REQUEST_RELEASE_ALL); + } } return (inflight_request_counter_ == 0); } @@ -1136,7 +1139,8 @@ EnsembleContext::FinishEnsemble(std::unique_ptr&& response) "more " "ensemble steps can be made"); InferenceRequest::RespondIfError( - request_tracker_->Request(), ensemble_status_); + request_tracker_->Request(), ensemble_status_, + false /* release_requests */, FailureReason::OTHER); } else { request_tracker_->Request()->ResponseFactory()->SendFlags( TRITONSERVER_RESPONSE_COMPLETE_FINAL); @@ -1149,7 +1153,8 @@ EnsembleContext::FinishEnsemble(std::unique_ptr&& 
response) ensemble_status_); } else { InferenceRequest::RespondIfError( - request_tracker_->Request(), ensemble_status_); + request_tracker_->Request(), ensemble_status_, + false /* release_requests */, FailureReason::OTHER); } } diff --git a/src/infer_request.cc b/src/infer_request.cc index 1ad567075..3d93de31d 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -421,10 +421,25 @@ InferenceRequest::Run(std::unique_ptr& request) return status; } +FailureReason +stringToFailureReason(const std::string& error_type) +{ + if (error_type == "REJECTED") { + return FailureReason::REJECTED; + } + if (error_type == "CANCELED") { + return FailureReason::CANCELED; + } + if (error_type == "BACKEND") { + return FailureReason::BACKEND; + } + return FailureReason::OTHER; +} + void InferenceRequest::RespondIfError( std::unique_ptr& request, const Status& status, - const bool release_request) + const bool release_request, FailureReason reason) { if (status.IsOk()) { return; @@ -442,7 +457,10 @@ InferenceRequest::RespondIfError( InferenceResponse::SendWithStatus( std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL, status), (request->LogRequest() + "failed to send error response").c_str()); - +#ifdef TRITON_ENABLE_STATS + request->ReportErrorStatistics( + request->model_raw_->MetricReporter().get(), reason); +#endif // If releasing the request then invoke the release callback which // gives ownership to the callback. So can't access 'request' after // this point. @@ -452,20 +470,6 @@ InferenceRequest::RespondIfError( } } -void -InferenceRequest::RespondIfError( - std::vector>& requests, - const Status& status, const bool release_requests) -{ - if (status.IsOk()) { - return; - } - - for (auto& request : requests) { - RespondIfError(request, status, release_requests); - } -} - Status InferenceRequest::Release( std::unique_ptr&& request, const uint32_t release_flags) @@ -1389,6 +1393,21 @@ InferenceRequest::ValidateBytesInputs( } #ifdef TRITON_ENABLE_STATS + +void +InferenceRequest::ReportErrorStatistics( + MetricModelReporter* metric_reporter, FailureReason reason) +{ + INFER_STATS_DECL_TIMESTAMP(request_end_ns); + model_raw_->MutableStatsAggregator()->UpdateFailure( + metric_reporter, request_start_ns_, request_end_ns, reason); + if (secondary_stats_aggregator_ != nullptr) { + secondary_stats_aggregator_->UpdateFailure( + nullptr /* metric_reporter */, request_start_ns_, request_end_ns, + reason); + } +} + void InferenceRequest::ReportStatistics( MetricModelReporter* metric_reporter, bool success, @@ -1425,10 +1444,12 @@ InferenceRequest::ReportStatistics( } } else { model_raw_->MutableStatsAggregator()->UpdateFailure( - metric_reporter, request_start_ns_, request_end_ns); + metric_reporter, request_start_ns_, request_end_ns, + FailureReason::BACKEND); if (secondary_stats_aggregator_ != nullptr) { secondary_stats_aggregator_->UpdateFailure( - nullptr /* metric_reporter */, request_start_ns_, request_end_ns); + nullptr /* metric_reporter */, request_start_ns_, request_end_ns, + FailureReason::BACKEND); } } } @@ -1461,10 +1482,12 @@ InferenceRequest::ReportStatisticsWithDuration( } } else { model_raw_->MutableStatsAggregator()->UpdateFailure( - metric_reporter, request_start_ns_, request_end_ns); + metric_reporter, request_start_ns_, request_end_ns, + FailureReason::OTHER); if (secondary_stats_aggregator_ != nullptr) { secondary_stats_aggregator_->UpdateFailure( - nullptr /* metric_reporter */, request_start_ns_, request_end_ns); + nullptr /* metric_reporter */, request_start_ns_, 
request_end_ns, + FailureReason::OTHER); } } } @@ -1868,5 +1891,4 @@ operator!=( { return !(lhs == rhs); } - }} // namespace triton::core diff --git a/src/infer_request.h b/src/infer_request.h index ec30d9ca3..a38b141af 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -590,7 +590,8 @@ class InferenceRequest { // 'release_request' is true 'request' is returned as nullptr. static void RespondIfError( std::unique_ptr& request, const Status& status, - const bool release_request = false); + const bool release_request = false, + FailureReason reason = FailureReason::OTHER); // Send an error response to a set of 'requests'. If 'status' is // Success then no responses are sent and the requests are not @@ -603,7 +604,8 @@ class InferenceRequest { // returned with all nullptrs. static void RespondIfError( std::vector>& requests, - const Status& status, const bool release_requests = false); + const Status& status, const bool release_requests = false, + FailureReason reason = FailureReason::OTHER); // Release the request. Call the release callback and transfer // ownership of the request to the callback. On return 'request' is @@ -673,6 +675,16 @@ class InferenceRequest { const uint64_t compute_start_ns, const uint64_t compute_input_end_ns, const uint64_t compute_output_start_ns, const uint64_t compute_end_ns); + // Report the error statistics to stats collectors associated with the + // request. + // FIXME: A separate function may not be necessary here, but is being used + // cautiously in case of unforeseen issues such as possibly capturing a trace + // twice. This should be revisited and better tested to see if the + // ReportStatistics function can be used as-is for the newly captured failure + // cases. + void ReportErrorStatistics( + MetricModelReporter* metric_reporter, FailureReason reason); + // Report the statistics to stats collectors associated with the request. // Duration and timestamps provide two granularities for stats collectors. 
void ReportStatisticsWithDuration( diff --git a/src/infer_stats.cc b/src/infer_stats.cc index 68cf70a0c..47ab309cb 100644 --- a/src/infer_stats.cc +++ b/src/infer_stats.cc @@ -36,10 +36,28 @@ namespace triton { namespace core { #ifdef TRITON_ENABLE_STATS +// This function converts FailureReason enum values to std::string +std::string +failureReasonToString(FailureReason reason) +{ + switch (reason) { + case FailureReason::REJECTED: + return "REJECTED"; + case FailureReason::CANCELED: + return "CANCELED"; + case FailureReason::BACKEND: + return "BACKEND"; + case FailureReason::OTHER: + return "OTHER"; + default: + return "OTHER"; + } +} + void InferenceStatsAggregator::UpdateFailure( MetricModelReporter* metric_reporter, const uint64_t request_start_ns, - const uint64_t request_end_ns) + const uint64_t request_end_ns, FailureReason reason) { std::lock_guard lock(mu_); @@ -48,7 +66,8 @@ InferenceStatsAggregator::UpdateFailure( #ifdef TRITON_ENABLE_METRICS if (metric_reporter != nullptr) { - metric_reporter->IncrementCounter("inf_failure", 1); + std::string reason_str = failureReasonToString(reason); + metric_reporter->IncrementCounter("inf_failure_" + reason_str, 1); } #endif // TRITON_ENABLE_METRICS } diff --git a/src/infer_stats.h b/src/infer_stats.h index 66b3659bd..15b173f45 100644 --- a/src/infer_stats.h +++ b/src/infer_stats.h @@ -39,6 +39,9 @@ namespace triton { namespace core { +// Define the FailureReason enum within the triton::core namespace +enum class FailureReason { REJECTED, CANCELED, BACKEND, OTHER }; + class MetricModelReporter; @@ -136,7 +139,7 @@ class InferenceStatsAggregator { // Add durations to Infer stats for a failed inference request. void UpdateFailure( MetricModelReporter* metric_reporter, const uint64_t request_start_ns, - const uint64_t request_end_ns); + const uint64_t request_end_ns, FailureReason reason); // Add durations to infer stats for a successful inference request. 
void UpdateSuccess( diff --git a/src/infer_trace.cc b/src/infer_trace.cc index cce46e262..4301e2c5b 100644 --- a/src/infer_trace.cc +++ b/src/infer_trace.cc @@ -26,6 +26,13 @@ #include "infer_trace.h" +#define TRITONJSON_STATUSTYPE triton::core::Status +#define TRITONJSON_STATUSRETURN(M) \ + return triton::core::Status(triton::core::Status::Code::INTERNAL, (M)) +#define TRITONJSON_STATUSSUCCESS triton::core::Status::Success +#include "triton/common/logging.h" +#include "triton/common/triton_json.h" + namespace triton { namespace core { #ifdef TRITON_ENABLE_TRACING @@ -48,6 +55,26 @@ InferenceTrace::Release() release_fn_(reinterpret_cast(this), userp_); } +void +InferenceTrace::RecordActivityName( + uint64_t timestamp_ns, std::string activity_name) +{ + std::lock_guard lock(mu_); + triton::common::TritonJson::Value context_json( + triton::common::TritonJson::ValueType::OBJECT); + if (!context_.empty()) { + Status status = context_json.Parse(context_); + if (!status.IsOk()) { + LOG_ERROR << "Error parsing trace context"; + } + } + std::string key = std::to_string(timestamp_ns); + context_json.SetStringObject(key.c_str(), activity_name); + triton::common::TritonJson::WriteBuffer buffer; + context_json.Write(&buffer); + context_ = buffer.Contents(); +} + std::shared_ptr InferenceTraceProxy::SpawnChildTrace() { diff --git a/src/infer_trace.h b/src/infer_trace.h index 4de6df788..4f16cf380 100644 --- a/src/infer_trace.h +++ b/src/infer_trace.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "constants.h" #include "status.h" @@ -69,12 +70,17 @@ class InferenceTrace { void SetModelVersion(int64_t v) { model_version_ = v; } void SetRequestId(const std::string& request_id) { request_id_ = request_id; } void SetContext(const std::string& context) { context_ = context; } + void RecordActivityName(uint64_t timestamp_ns, std::string activity_name); // Report trace activity. void Report( - const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns) + const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, + std::string activity_name = "") { if ((level_ & TRITONSERVER_TRACE_LEVEL_TIMESTAMPS) > 0) { + if (activity == TRITONSERVER_TRACE_CUSTOM_ACTIVITY) { + RecordActivityName(timestamp_ns, activity_name); + } activity_fn_( reinterpret_cast(this), activity, timestamp_ns, userp_); @@ -82,13 +88,15 @@ class InferenceTrace { } // Report trace activity at the current time. 
- void ReportNow(const TRITONSERVER_InferenceTraceActivity activity) + void ReportNow( + const TRITONSERVER_InferenceTraceActivity activity, + std::string activity_name = "") { if ((level_ & TRITONSERVER_TRACE_LEVEL_TIMESTAMPS) > 0) { - Report( - activity, std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()) - .count()); + auto now = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + Report(activity, now, activity_name); } } @@ -128,6 +136,7 @@ class InferenceTrace { // across traces static std::atomic next_id_; std::string context_; + std::mutex mu_; }; // @@ -152,6 +161,10 @@ class InferenceTraceProxy { void SetRequestId(const std::string& n) { trace_->SetRequestId(n); } void SetModelVersion(int64_t v) { trace_->SetModelVersion(v); } void SetContext(const std::string& context) { trace_->SetContext(context); } + void RecordActivityName(uint64_t timestamp_ns, std::string activity_name) + { + trace_->RecordActivityName(timestamp_ns, activity_name); + } void Report( const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns) diff --git a/src/metric_model_reporter.cc b/src/metric_model_reporter.cc index be43da844..9dd9122be 100644 --- a/src/metric_model_reporter.cc +++ b/src/metric_model_reporter.cc @@ -29,6 +29,7 @@ #ifdef TRITON_ENABLE_METRICS #include "constants.h" +#include "infer_stats.h" #include "triton/common/logging.h" // Global config group has 'name' of empty string. @@ -101,6 +102,13 @@ MetricReporterConfig::ParseQuantiles(std::string options) // // MetricModelReporter // +const std::map + MetricModelReporter::failure_reasons_map = { + {FailureReason::REJECTED, "REJECTED"}, + {FailureReason::CANCELED, "CANCELED"}, + {FailureReason::BACKEND, "BACKEND"}, + {FailureReason::OTHER, "OTHER"}}; + Status MetricModelReporter::Create( const ModelIdentifier& model_id, const int64_t model_version, @@ -189,7 +197,6 @@ MetricModelReporter::InitializeCounters( { // Always setup these counters, regardless of config counter_families_["inf_success"] = &Metrics::FamilyInferenceSuccess(); - counter_families_["inf_failure"] = &Metrics::FamilyInferenceFailure(); counter_families_["inf_count"] = &Metrics::FamilyInferenceCount(); counter_families_["inf_exec_count"] = &Metrics::FamilyInferenceExecutionCount(); @@ -227,6 +234,15 @@ MetricModelReporter::InitializeCounters( counters_[name] = CreateMetric(*family_ptr, labels); } } + + // Initialize failure metrics with reasons + for (const auto& reason_pair : failure_reasons_map) { + std::map extended_labels = labels; + extended_labels["reason"] = reason_pair.second; + counters_["inf_failure_" + reason_pair.second] = + CreateMetric( + Metrics::FamilyInferenceFailure(), extended_labels); + } } void diff --git a/src/metric_model_reporter.h b/src/metric_model_reporter.h index 5e2e073cf..9378905ae 100644 --- a/src/metric_model_reporter.h +++ b/src/metric_model_reporter.h @@ -94,6 +94,8 @@ class MetricModelReporter { // Lookup summary metric by name, and observe the value if it exists. 
void ObserveSummary(const std::string& name, double value); + static const std::map failure_reasons_map; + private: MetricModelReporter( const ModelIdentifier& model_id, const int64_t model_version, diff --git a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc index cf3691959..74314e7ab 100644 --- a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc +++ b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -91,7 +91,9 @@ CancelRequests(std::vector>&& requests) LOG_ERROR << status.Message(); } // Respond the request as cancelled. - InferenceRequest::RespondIfError(req, cancelled_status, true); + InferenceRequest::RespondIfError( + req, cancelled_status, true /* release_requests */, + FailureReason::CANCELED); } } @@ -1173,7 +1175,9 @@ SequenceBatchScheduler::ReaperThread(const int nice) "timeout of the corresponding sequence has been expired"); for (auto& backlog : expired_backlogs) { for (auto& req : *backlog->queue_) { - InferenceRequest::RespondIfError(req, rejected_status, true); + InferenceRequest::RespondIfError( + req, rejected_status, true /* release_requests */, + FailureReason::REJECTED); } } } diff --git a/src/test/async_work_queue_test.cc b/src/test/async_work_queue_test.cc index 1d7b3be90..735086363 100644 --- a/src/test/async_work_queue_test.cc +++ b/src/test/async_work_queue_test.cc @@ -91,7 +91,6 @@ TEST_F(AsyncWorkQueueTest, WorkerCountInitialized) << "Expect 4 worker count for initialized queue"; } - TEST_F(AsyncWorkQueueTest, RunTasksInParallel) { auto AddTwoFn = [](const std::vector& lhs, const std::vector& rhs, @@ -181,10 +180,11 @@ TEST_F(AsyncWorkQueueTest, RunTasksInParallel) .count(); parallelized_duration = end_ts - start_ts; - // FIXME manual testing shows parallelized time is between 30% to 33.3% for - // 128 M total elements - EXPECT_LT(parallelized_duration, serialized_duration / 3) - << "Expected parallelized work was completed within 1/3 of serialized " + // FIXME manual testing shows parallelized time is between 30% to 33.3% + // for 128 M total elements, but is flaky in CI so relax the comparison + // for parallel to simply be faster than serial by any amount. 
+ EXPECT_LT(parallelized_duration, serialized_duration) + << "Expected parallelized work was completed faster than serialized " "time"; for (size_t count = 0; count < task_count; count++) { auto res = std::move(fs[count].get()); diff --git a/src/tritonserver.cc b/src/tritonserver.cc index eae83ef2f..82642d5dc 100644 --- a/src/tritonserver.cc +++ b/src/tritonserver.cc @@ -950,6 +950,8 @@ TRITONSERVER_InferenceTraceActivityString( return "TENSOR_BACKEND_INPUT"; case TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT: return "TENSOR_BACKEND_OUTPUT"; + case TRITONSERVER_TRACE_CUSTOM_ACTIVITY: + return "CUSTOM_ACTIVITY"; } return ""; @@ -1115,6 +1117,23 @@ TRITONSERVER_InferenceTraceSpawnChildTrace( #endif // TRITON_ENABLE_TRACING } +TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_InferenceTraceReportActivity( + TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, + const char* activity_name) +{ +#ifdef TRITON_ENABLE_TRACING + tc::InferenceTrace* ltrace = reinterpret_cast(trace); + if (trace != nullptr) { + ltrace->Report( + TRITONSERVER_TRACE_CUSTOM_ACTIVITY, timestamp, activity_name); + } + return nullptr; // Success +#else + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, "inference tracing not supported"); +#endif // TRITON_ENABLE_TRACING +} TRITONAPI_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceSetContext( diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc index b8449d1bc..cd1e03e15 100644 --- a/src/tritonserver_stub.cc +++ b/src/tritonserver_stub.cc @@ -1115,6 +1115,11 @@ TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup() { } +TRITONAPI_DECLSPEC void +TRITONSERVER_InferenceTraceReportActivity() +{ +} + TRITONAPI_DECLSPEC void TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading() {
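Usage sketch for the new custom-trace entry point (not part of the patch): TRITONSERVER_InferenceTraceReportActivity lets a caller attach a named, timestamped activity to an existing trace. The sketch below assumes a valid TRITONSERVER_InferenceTrace* obtained elsewhere (for example, the trace associated with an in-flight request) and a build with TRITON_ENABLE_TRACING; it uses the same steady_clock nanosecond timestamps as the existing Report()/ReportNow() helpers in infer_trace.h.

// Minimal sketch; `trace` and `name` are supplied by the caller.
#include <chrono>
#include <cstdint>

#include "triton/core/tritonserver.h"

void
ReportCustomActivity(TRITONSERVER_InferenceTrace* trace, const char* name)
{
  // Match the nanosecond steady_clock timestamps used elsewhere in the
  // trace API.
  const uint64_t now_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count();

  // Reported with the new TRITONSERVER_TRACE_CUSTOM_ACTIVITY type; the
  // activity is only emitted when the trace level includes TIMESTAMPS.
  TRITONSERVER_Error* err =
      TRITONSERVER_InferenceTraceReportActivity(trace, now_ns, name);
  if (err != nullptr) {
    // Log or propagate as appropriate, then free the error object.
    TRITONSERVER_ErrorDelete(err);
  }
}

Each reported activity is also stored in the trace's context as a JSON member keyed by the timestamp string (see InferenceTrace::RecordActivityName above), so the activity names can later be recovered through the trace context getter.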
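The failure-metric changes above split the per-model inference failure counter by a new "reason" label (REJECTED, CANCELED, BACKEND, OTHER) and thread a FailureReason through RespondIfError and UpdateFailure. A minimal sketch of how a core scheduler path now reports rejected requests, using only the signatures added in this diff; the queue variable, function name, and namespace alias are illustrative.

// Sketch mirroring FinishRejectedCancelledRequests and the sequence-batcher
// reaper changes above; assumes the core headers touched by this diff.
#include <memory>
#include <vector>

#include "infer_request.h"
#include "infer_stats.h"
#include "status.h"

namespace tc = triton::core;

void
RejectExpiredRequests(
    std::vector<std::unique_ptr<tc::InferenceRequest>>& expired)
{
  static const tc::Status rejected_status(
      tc::Status::Code::UNAVAILABLE, "Request timeout expired");
  for (auto& request : expired) {
    // Sends the error response, releases the request, and updates the
    // failure statistics under FailureReason::REJECTED, which the metric
    // reporter maps to the counter labeled reason="REJECTED".
    tc::InferenceRequest::RespondIfError(
        request, rejected_status, true /* release_requests */,
        tc::FailureReason::REJECTED);
  }
}

Internally the reporter keys each counter as "inf_failure_" plus the string from failureReasonToString(), and MetricModelReporter::InitializeCounters() registers one counter per entry in failure_reasons_map from the existing inference-failure family with the extra "reason" label.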