diff --git a/all_models/inflight_batcher_llm/ensemble/config.pbtxt b/all_models/inflight_batcher_llm/ensemble/config.pbtxt
index 4c0b28e2..99f0a6c2 100755
--- a/all_models/inflight_batcher_llm/ensemble/config.pbtxt
+++ b/all_models/inflight_batcher_llm/ensemble/config.pbtxt
@@ -241,6 +241,10 @@ ensemble_scheduling {
         key: "TOKENS_BATCH"
         value: "_TOKENS_BATCH"
       }
+      input_map {
+        key: "REQUEST_INPUT_LEN"
+        value: "_REQUEST_INPUT_LEN"
+      }
       input_map {
         key: "SEQUENCE_LENGTH"
         value: "_SEQUENCE_LENGTH"
diff --git a/all_models/inflight_batcher_llm/postprocessing/1/model.py b/all_models/inflight_batcher_llm/postprocessing/1/model.py
index 6f503a0c..8fe50c11 100644
--- a/all_models/inflight_batcher_llm/postprocessing/1/model.py
+++ b/all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -109,6 +109,8 @@ def execute(self, requests):
             tokens_batch = pb_utils.get_input_tensor_by_name(
                 request, 'TOKENS_BATCH').as_numpy()
 
+            request_input_lens = pb_utils.get_input_tensor_by_name(
+                request, 'REQUEST_INPUT_LEN').as_numpy()
             # Get sequence length
             sequence_lengths = pb_utils.get_input_tensor_by_name(
                 request, 'SEQUENCE_LENGTH').as_numpy()
@@ -118,7 +120,7 @@
             # tokens_batch = tokens_batch.T
 
             # Postprocessing output data.
-            outputs = self._postprocessing(tokens_batch, sequence_lengths)
+            outputs = self._postprocessing(tokens_batch, request_input_lens, sequence_lengths)
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
@@ -148,11 +150,12 @@ def finalize(self):
         """
         print('Cleaning up...')
 
-    def _postprocessing(self, tokens_batch, sequence_lengths):
+    def _postprocessing(self, tokens_batch, request_input_lens, sequence_lengths):
         outputs = []
         for batch_idx, beam_tokens in enumerate(tokens_batch):
             for beam_idx, tokens in enumerate(beam_tokens):
                 seq_len = sequence_lengths[batch_idx][beam_idx]
-                output = self.tokenizer.decode(tokens[:seq_len])
+                request_input_len = request_input_lens[batch_idx].tolist()[0]
+                output = self.tokenizer.decode(tokens[request_input_len:max(seq_len-1, 1)])
                 outputs.append(output.encode('utf8'))
         return outputs
diff --git a/all_models/inflight_batcher_llm/postprocessing/config.pbtxt b/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
index 3aca71fd..9422e5d2 100755
--- a/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
+++ b/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -33,6 +33,11 @@ input [
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
   },
+  {
+    name: "REQUEST_INPUT_LEN"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
   {
     name: "SEQUENCE_LENGTH"
     data_type: TYPE_INT32
diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
index cd34834b..a4252a3a 100644
--- a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
+++ b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -29,7 +29,7 @@ backend: "tensorrtllm"
 max_batch_size: 128
 
 model_transaction_policy {
-  decoupled: ${decoupled_mode}
+  decoupled: False
 }
 
 input [
diff --git a/tools/inflight_batcher_llm/end_to_end_test.py b/tools/inflight_batcher_llm/end_to_end_test.py
index 984d4a6b..41ff9ec7 100644
--- a/tools/inflight_batcher_llm/end_to_end_test.py
+++ b/tools/inflight_batcher_llm/end_to_end_test.py
@@ -46,13 +46,13 @@ def test_functionality(client, prompts, output_lens):
         ]
         result = client.infer(model_name, inputs, request_id=str(i))
         output0 = result.as_numpy("INPUT_ID")
-        output1 = result.as_numpy("REQUEST_INPUT_LEN")
+        request_input_len = result.as_numpy("REQUEST_INPUT_LEN")
         output2 = result.as_numpy("REQUEST_OUTPUT_LEN")
 
         model_name = "tensorrt_llm"
         inputs = [
             utils.prepare_tensor("input_ids", output0, FLAGS.protocol),
-            utils.prepare_tensor("input_lengths", output1, FLAGS.protocol),
+            utils.prepare_tensor("input_lengths", request_input_len, FLAGS.protocol),
             utils.prepare_tensor("request_output_len", output2,
                                  FLAGS.protocol),
         ]
@@ -63,11 +63,12 @@
 
         model_name = "postprocessing"
         inputs = [
             utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol),
+            utils.prepare_tensor("REQUEST_INPUT_LEN", request_input_len, FLAGS.protocol),
             utils.prepare_tensor("SEQUENCE_LENGTH", seq_lengths, FLAGS.protocol)
         ]
         inputs[0].set_data_from_numpy(output0)
-        inputs[1].set_data_from_numpy(seq_lengths)
+        inputs[2].set_data_from_numpy(seq_lengths)
 
         result = client.infer(model_name, inputs, request_id=str(i))
         output0 = result.as_numpy("OUTPUT")