From 88c55b3c3e2c37dc092f05a79eb5aae2d99e8470 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 6 Jan 2025 12:21:43 +0000 Subject: [PATCH 1/5] perf(guidellm): restrict sweep runs --- .../text-generation-inference/performance/benchmark.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/text-generation-inference/performance/benchmark.sh b/benchmark/text-generation-inference/performance/benchmark.sh index 2e3a1f60e..f510f64f3 100755 --- a/benchmark/text-generation-inference/performance/benchmark.sh +++ b/benchmark/text-generation-inference/performance/benchmark.sh @@ -7,9 +7,13 @@ output_path="${model//\//_}#${date_str}_guidellm_report.json" export HF_TOKEN=$(cat ~/.cache/huggingface/token) +export GUIDELLM__NUM_SWEEP_PROFILES=1 +export GUIDELLM__MAX_CONCURRENCY=128 +export GUIDELLM__REQUEST_TIMEOUT=60 + guidellm \ --target "http://localhost:8080/v1" \ --model ${model} \ --data-type emulated \ --data "prompt_tokens=1500,prompt_tokens_variance=150,generated_tokens=250,generated_tokens_variance=20" \ - --output-path ${output_path} + --output-path ${output_path} \ From d4084a32d4ce64e873e50f3c221107e6823b945f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 6 Jan 2025 16:40:02 +0000 Subject: [PATCH 2/5] fix(generation): remove base transformers warpers The logits warpers used for sampling are now included in the list of processors obtained when calling GenerationMixin._get_logits_processor. We need to remove them explicitly because we use a fused logits warper instead (which is 10x faster). 
--- optimum/neuron/generation/token_selector.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/optimum/neuron/generation/token_selector.py b/optimum/neuron/generation/token_selector.py index 6edd4fd1c..8e9f04349 100644 --- a/optimum/neuron/generation/token_selector.py +++ b/optimum/neuron/generation/token_selector.py @@ -8,6 +8,9 @@ GenerationMixin, LogitsProcessorList, StoppingCriteriaList, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, ) from transformers.generation.utils import GenerationMode @@ -154,6 +157,15 @@ def create( logits_warper = None if generation_mode == GenerationMode.SAMPLE: + # Remove transformers TopK, TopP and Temperature processors + logits_processor = LogitsProcessorList( + [ + p + for p in logits_processor + if not isinstance(p, (TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper)) + ] + ) + # We use a fused logits warper instead logits_warper = FusedLogitsWarper.from_config(generation_config) return cls( From ad12b33c52aedaee3c55988fb1aa7f64c6c44b81 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 7 Jan 2025 08:55:34 +0000 Subject: [PATCH 3/5] Revert "test(tgi): adjust sampling expectations" This reverts commit a718411784046ac47d57b3dff6a655faa8d7ef09. 
--- .../tests/integration/test_generate.py | 4 ++-- text-generation-inference/tests/server/test_decode.py | 8 ++++---- text-generation-inference/tests/server/test_prefill.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/text-generation-inference/tests/integration/test_generate.py b/text-generation-inference/tests/integration/test_generate.py index 75c064a38..db716be57 100644 --- a/text-generation-inference/tests/integration/test_generate.py +++ b/text-generation-inference/tests/integration/test_generate.py @@ -48,8 +48,8 @@ async def test_model_single_request(tgi_service): ) sample_expectations = { "gpt2": "Deep Learning", - "llama": "Deep learning", - "mistral": "Deep Learning", + "llama": "Deep Learning", + "mistral": "Deep learning", "qwen2": "Deep Learning", "granite": "Deep learning", } diff --git a/text-generation-inference/tests/server/test_decode.py b/text-generation-inference/tests/server/test_decode.py index 5bfc6ca97..1303594db 100644 --- a/text-generation-inference/tests/server/test_decode.py +++ b/text-generation-inference/tests/server/test_decode.py @@ -36,10 +36,10 @@ def _test_decode(config_name, generator, do_sample): assert output.finish_reason == 0 if do_sample: expected_text = { - "gpt2": " the wind was blowing", - "llama": "George Orwell", - "mistral": "The sky is black", - "qwen2": " I stood in the back yard", + "gpt2": " The sun was set", + "llama": "George Orwell, 1984", + "mistral": "The sky was", + "qwen2": " A young woman with", "granite": "Aldous Huxley, Brave New World", }[config_name] assert expected_text in output.text diff --git a/text-generation-inference/tests/server/test_prefill.py b/text-generation-inference/tests/server/test_prefill.py index 7214a6b6a..90634802c 100644 --- a/text-generation-inference/tests/server/test_prefill.py +++ b/text-generation-inference/tests/server/test_prefill.py @@ -35,10 +35,10 @@ def _test_prefill(config_name, generator, batch_size, do_sample): assert len(generations) == 
batch_size if do_sample: expectations = { - "gpt2": [632, " It"], + "gpt2": [383, " The"], "llama": [10058, " George"], "mistral": [450, " The"], - "qwen2": [358, " I"], + "qwen2": [362, " A"], "granite": [429, " -"], }[config_name] else: From d5b7157817de8d4893e9a43f0750f6e7eb3eb573 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 7 Jan 2025 10:12:15 +0000 Subject: [PATCH 4/5] test(tgi): adjust granite sampling expectations --- text-generation-inference/tests/server/test_decode.py | 2 +- text-generation-inference/tests/server/test_prefill.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/text-generation-inference/tests/server/test_decode.py b/text-generation-inference/tests/server/test_decode.py index 1303594db..2ab4c2da0 100644 --- a/text-generation-inference/tests/server/test_decode.py +++ b/text-generation-inference/tests/server/test_decode.py @@ -40,7 +40,7 @@ def _test_decode(config_name, generator, do_sample): "llama": "George Orwell, 1984", "mistral": "The sky was", "qwen2": " A young woman with", - "granite": "Aldous Huxley, Brave New World", + "granite": "1984, George Orwell", }[config_name] assert expected_text in output.text else: diff --git a/text-generation-inference/tests/server/test_prefill.py b/text-generation-inference/tests/server/test_prefill.py index 90634802c..2120e5c59 100644 --- a/text-generation-inference/tests/server/test_prefill.py +++ b/text-generation-inference/tests/server/test_prefill.py @@ -39,7 +39,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample): "llama": [10058, " George"], "mistral": [450, " The"], "qwen2": [362, " A"], - "granite": [429, " -"], + "granite": [308, " ("], }[config_name] else: expectations = { From 31bd1ade7de6b09cdc2c1346c4a3495c5e2ae417 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 2 Jan 2025 16:01:37 +0000 Subject: [PATCH 5/5] refactor(trainer): adapt to new compute_loss signature --- optimum/neuron/trainers.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py index 6d9f25348..6f5f04afb 100755 --- a/optimum/neuron/trainers.py +++ b/optimum/neuron/trainers.py @@ -401,14 +401,14 @@ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[s self._update_input_specs_in_model_cache_entry(input_specs_for_cache_entry) return inputs - def compute_loss(self, model, inputs, return_outputs: bool = False): + def compute_loss(self, model, inputs, num_items_in_batch): from neuronx_distributed.pipeline import NxDPPModel if isinstance(model, NxDPPModel): inputs = self._prepare_inputs(inputs) loss = model.run_train(**inputs) else: - loss = super().compute_loss(model, inputs, return_outputs=return_outputs) + loss = super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) return loss def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):