
Commit

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-6657-client-input-byte-size-check
yinggeh committed Jul 23, 2024
2 parents 39715f4 + 70a0eee commit 8d5f411
Showing 20 changed files with 732 additions and 263 deletions.
29 changes: 12 additions & 17 deletions build.py
@@ -76,7 +76,7 @@
"2024.0.0", # ORT OpenVINO
"2024.0.0", # Standalone OpenVINO
"3.2.6", # DCGM version
"0.4.3", # vLLM version
"0.5.0.post1", # vLLM version
)
}

@@ -1082,25 +1082,20 @@ def create_dockerfile_linux(
"""
if "tensorrtllm" in backends:
df += """
RUN ldconfig
# Remove contents that are not needed in runtime
RUN ARCH="$(uname -i)" \\
&& rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data \\
&& rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python \\
&& rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
# Install required packages for TRT-LLM models
RUN python3 -m pip install --upgrade pip \\
&& pip3 install transformers
# ldconfig for TRT-LLM
RUN find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf
RUN find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf
# Remove contents that are not needed in runtime
# Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1
# The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0
RUN pip3 install setuptools==69.5.1 grpcio-tools==1.64.0
RUN ldconfig && \
ARCH="$(uname -i)" && \
rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \
python3 -m pip install --upgrade pip && \
pip3 install --no-cache-dir transformers && \
find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \
find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \
pip3 install --no-cache-dir setuptools==69.5.1 grpcio-tools==1.64.0
ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
"""
12 changes: 9 additions & 3 deletions docs/protocol/extension_generate.md
@@ -1,5 +1,5 @@
<!--
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -87,10 +87,12 @@ return an error.

$generate_request =
{
"id" : $string, #optional
"text_input" : $string,
"parameters" : $parameters #optional
}

* "id": An identifier for this request. Optional, but if specified this identifier must be returned in the response.
* "text_input" : The text input that the model should generate output from.
* "parameters" : An optional object containing zero or more parameters for this
generate request expressed as key/value pairs. See
@@ -121,14 +123,15 @@ specification to set the parameters.
Below is an example of sending a generate request with additional model parameters `stream` and `temperature`.

```
$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'
$ curl -X POST localhost:8000/v2/models/mymodel/generate -d '{"id": "42", "text_input": "client input", "parameters": {"stream": false, "temperature": 0}}'
POST /v2/models/mymodel/generate HTTP/1.1
Host: localhost:8000
Content-Type: application/json
Content-Length: <xx>
{
"text_input": "client input",
"id" : "42",
"text_input" : "client input",
"parameters" :
{
"stream": false,
@@ -145,11 +148,13 @@ the HTTP body.

$generate_response =
{
"id" : $string
"model_name" : $string,
"model_version" : $string,
"text_output" : $string
}

* "id" : The "id" identifier given in the request, if any.
* "model_name" : The name of the model used for inference.
* "model_version" : The specific model version used for inference.
* "text_output" : The output of the inference.
@@ -159,6 +164,7 @@ the HTTP body.
```
200
{
"id" : "42"
"model_name" : "mymodel",
"model_version" : "1",
"text_output" : "model output"
19 changes: 19 additions & 0 deletions docs/user_guide/metrics.md
@@ -100,6 +100,25 @@ Count*. The count metrics are illustrated by the following examples:
| |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
| |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|

#### Failure Count Categories

| Failed Request Reason |Description |
|------------|------------|
| REJECTED | Number of inference failures due to request timeout in the scheduler. |
| CANCELED | Number of inference failures due to request cancellation in the core. |
| BACKEND | Number of inference failures during execution of requests in the backend/model. |
| OTHER | Number of inference failures due to other uncategorized reasons in the core. |

> **Note**
>
> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason.
>
> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`.
> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`.
> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client.
> The `reason=OTHER` reflects the fact that the ensemble doesn't currently capture the specific reason why
> ModelA's request failed at this time.

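For a rough idea of how these per-reason failure counters can be consumed (a sketch, not part of this commit), the snippet below scrapes Triton's metrics endpoint; port 8002 is the default metrics port, and the label layout matches the pattern used by the lifecycle test later in this diff.

```
# Sketch: scrape the per-reason failure counters described above.
# Port 8002 is Triton's default metrics port; adjust if configured differently.
import re

import requests

text = requests.get("http://localhost:8002/metrics").text

# For the ensemble example in the note above, lines like these would appear:
#   nv_inference_request_failure{model="ModelA",reason="REJECTED",version="1"} 1
#   nv_inference_request_failure{model="EnsembleA",reason="OTHER",version="1"} 2
pattern = re.compile(
    r'nv_inference_request_failure\{'
    r'model="(?P<model>[^"]+)",reason="(?P<reason>[^"]+)",version="(?P<version>[^"]+)"\}'
    r' (?P<count>\d+)'
)
for m in pattern.finditer(text):
    print(f'{m["model"]} ({m["reason"]}): {m["count"]}')
```
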
#### Pending Request Count (Queue Size) Per-Model

The *Pending Request Count* reflects the number of requests that have been
16 changes: 16 additions & 0 deletions docs/user_guide/trace.md
@@ -623,6 +623,22 @@ Then, you can specify headers in the `infer` method. For references, please
look at our [tests](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py),
e.g. [http context propagation test](https://github.com/triton-inference-server/server/blob/main/qa/L0_trace/opentelemetry_unittest.py#L494-L508).

### Custom Backend Tracing

When a custom activity needs to be traced in the backend, please use the
`TRITONSERVER_InferenceTraceReportActivity` API. For examples, please refer
to the [identity backend](https://github.com/triton-inference-server/identity_backend/blob/main/src/identity.cc).

In OpenTelemetry trace mode, to start a new span, make sure that the name of
your custom activity ends with `_START`. To end the new span, make sure that
the corresponding activity name ends with `_END`. For example, in the
identity backend, we start a `CUSTOM_ACTIVITY` span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L872-L876)
a `CUSTOM_ACTIVITY_START` event, and we close this span by [reporting](https://github.com/triton-inference-server/identity_backend/blob/oandreeva-custom-trace-activity/src/identity.cc#L880-L883)
a `CUSTOM_ACTIVITY_END` event.

Please note that it is the user's responsibility to make sure that all
custom-started spans are properly ended.

### Limitations

- OpenTelemetry trace mode is not supported on Windows systems.
2 changes: 2 additions & 0 deletions qa/L0_backend_python/env/test.sh
@@ -253,6 +253,7 @@ run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
aws s3 rb "${BUCKET_URL}" --force || true
exit 1
fi

@@ -286,6 +287,7 @@ run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
aws s3 rb "${BUCKET_URL}" --force || true
exit 1
fi

35 changes: 34 additions & 1 deletion qa/L0_backend_python/lifecycle/lifecycle_test.py
@@ -27,8 +27,11 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import sys

import requests

sys.path.append("../../common")

import queue
@@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase):
def setUp(self):
self._shm_leak_detector = shm_util.ShmLeakDetector()

def _get_metrics(self):
metrics_url = "http://localhost:8002/metrics"
r = requests.get(metrics_url)
r.raise_for_status()
return r.text

def _metrics_before_test(self, model, reason):
pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
metrics = self._get_metrics()
match = re.search(pattern, metrics)
if match:
return int(match.group(1))
else:
raise Exception(f"Failure metrics for model='{model}' not found")

def _assert_metrics(
self, model_name, reason, expected_count_increase, initial_count
):
metrics = self._get_metrics()
# Add initial count + expected count for the test
expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
self.assertIn(expected_metric, metrics)

def test_error_code(self):
model_name = "error_code"
shape = [1, 1]
@@ -181,7 +207,7 @@ def test_batch_error(self):
def test_infer_pymodel_error(self):
model_name = "wrong_model"
shape = [2, 2]

initial_metrics_value = self._metrics_before_test(model_name, "BACKEND")
with self._shm_leak_detector.Probe() as shm_probe:
with httpclient.InferenceServerClient(
f"{_tritonserver_ipaddr}:8000"
@@ -207,6 +233,13 @@ def test_infer_pymodel_error(self):
self.assertTrue(
False, "Wrong exception raised or did not raise an exception"
)
expected_count_increase = 1
self._assert_metrics(
model_name,
"BACKEND",
expected_count_increase,
initial_metrics_value,
)


if __name__ == "__main__":
32 changes: 23 additions & 9 deletions qa/L0_grpc_state_cleanup/cleanup_test.py
@@ -437,10 +437,10 @@ def test_simple_infer_error_status(self):

def test_simple_infer_shutdownserver(self):
# This test case is used to check whether all the state objects are
# released when the server is interrupted to shutdown in middle of
# inference run with final parameters being returned.
# released when the server is interrupted to shutdown in the beginning
# of inference run with final parameters being returned.
with self.assertRaises(InferenceServerException) as cm:
self._simple_infer(request_count=10, kill_server=5)
self._simple_infer(request_count=20, kill_server=5)

###
### Streaming Tests
@@ -469,11 +469,18 @@ def test_streaming_timeout(self):
def test_streaming_error_status(self):
# This test case is used to check whether all the state objects are
# released when RPC runs into error.
expected_exceptions = [
"This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
"The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
]
with self.assertRaises(InferenceServerException) as cm:
self._streaming_infer(request_count=10, should_error=True)
self.assertIn(
"This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
str(cm.exception),

exception_match = False
for expected_exception in expected_exceptions:
exception_match |= expected_exception in str(cm.exception)
self.assertTrue(
exception_match, "Raised unexpected exception {}".format(str(cm.exception))
)

def test_streaming_infer_shutdownserver(self):
Expand Down Expand Up @@ -520,11 +527,18 @@ def test_decoupled_timeout(self):
def test_decoupled_error_status(self):
# This test case is used to check whether all the state objects are
# released when RPC runs into error.
expected_exceptions = [
"This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
"The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.",
]
with self.assertRaises(InferenceServerException) as cm:
self._decoupled_infer(request_count=10, repeat_count=10, should_error=True)
self.assertIn(
"This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'",
str(cm.exception),

exception_match = False
for expected_exception in expected_exceptions:
exception_match |= expected_exception in str(cm.exception)
self.assertTrue(
exception_match, "Raised unexpected exception {}".format(str(cm.exception))
)

def test_decoupled_infer_shutdownserver(self):
43 changes: 43 additions & 0 deletions qa/L0_http/generate_endpoint_test.py
@@ -142,6 +142,49 @@ def test_generate(self):
self.assertIn("TEXT", data)
self.assertEqual(text, data["TEXT"])

def test_request_id(self):
# Setup text based input
text = "hello world"
request_id = "42"

# Test when request id in request body
inputs = {"PROMPT": text, "id": request_id, "STREAM": False}
r = self.generate(self._model_name, inputs)
r.raise_for_status()

self.assertIn("Content-Type", r.headers)
self.assertEqual(r.headers["Content-Type"], "application/json")

data = r.json()
self.assertIn("id", data)
self.assertEqual(request_id, data["id"])
self.assertIn("TEXT", data)
self.assertEqual(text, data["TEXT"])

# Test when request id not in request body
inputs = {"PROMPT": text, "STREAM": False}
r = self.generate(self._model_name, inputs)
r.raise_for_status()

self.assertIn("Content-Type", r.headers)
self.assertEqual(r.headers["Content-Type"], "application/json")

data = r.json()
self.assertNotIn("id", data)

# Test when request id is empty
inputs = {"PROMPT": text, "id": "", "STREAM": False}
r = self.generate(self._model_name, inputs)
r.raise_for_status()

self.assertIn("Content-Type", r.headers)
self.assertEqual(r.headers["Content-Type"], "application/json")

data = r.json()
self.assertNotIn("id", data)
self.assertIn("TEXT", data)
self.assertEqual(text, data["TEXT"])

def test_generate_stream(self):
# Setup text-based input
text = "hello world"
2 changes: 1 addition & 1 deletion qa/L0_http/test.sh
@@ -662,7 +662,7 @@ fi
## Python Unit Tests
TEST_RESULT_FILE='test_results.txt'
PYTHON_TEST=generate_endpoint_test.py
EXPECTED_NUM_TESTS=15
EXPECTED_NUM_TESTS=16
set +e
python $PYTHON_TEST >$CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
