diff --git a/benchmark/text-generation-inference/performance/benchmark.sh b/benchmark/text-generation-inference/performance/benchmark.sh
index 2e3a1f60e..f510f64f3 100755
--- a/benchmark/text-generation-inference/performance/benchmark.sh
+++ b/benchmark/text-generation-inference/performance/benchmark.sh
@@ -7,9 +7,13 @@ output_path="${model//\//_}#${date_str}_guidellm_report.json"
 
 export HF_TOKEN=$(cat ~/.cache/huggingface/token)
 
+export GUIDELLM__NUM_SWEEP_PROFILES=1
+export GUIDELLM__MAX_CONCURRENCY=128
+export GUIDELLM__REQUEST_TIMEOUT=60
+
 guidellm \
     --target "http://localhost:8080/v1" \
     --model ${model} \
     --data-type emulated \
     --data "prompt_tokens=1500,prompt_tokens_variance=150,generated_tokens=250,generated_tokens_variance=20" \
-    --output-path ${output_path}
+    --output-path ${output_path} \
diff --git a/optimum/neuron/generation/token_selector.py b/optimum/neuron/generation/token_selector.py
index 6edd4fd1c..8e9f04349 100644
--- a/optimum/neuron/generation/token_selector.py
+++ b/optimum/neuron/generation/token_selector.py
@@ -8,6 +8,9 @@
     GenerationMixin,
     LogitsProcessorList,
     StoppingCriteriaList,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
 )
 from transformers.generation.utils import GenerationMode
 
@@ -154,6 +157,15 @@ def create(
 
         logits_warper = None
         if generation_mode == GenerationMode.SAMPLE:
+            # Remove transformers TopK, TopP and Temperature processors
+            logits_processor = LogitsProcessorList(
+                [
+                    p
+                    for p in logits_processor
+                    if not isinstance(p, (TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper))
+                ]
+            )
+            # We use a fused logits warper instead
             logits_warper = FusedLogitsWarper.from_config(generation_config)
 
         return cls(
diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py
index 6d9f25348..6f5f04afb 100755
--- a/optimum/neuron/trainers.py
+++ b/optimum/neuron/trainers.py
@@ -401,14 +401,14 @@ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[s
         self._update_input_specs_in_model_cache_entry(input_specs_for_cache_entry)
         return inputs
 
-    def compute_loss(self, model, inputs, return_outputs: bool = False):
+    def compute_loss(self, model, inputs, num_items_in_batch):
         from neuronx_distributed.pipeline import NxDPPModel
 
         if isinstance(model, NxDPPModel):
             inputs = self._prepare_inputs(inputs)
             loss = model.run_train(**inputs)
         else:
-            loss = super().compute_loss(model, inputs, return_outputs=return_outputs)
+            loss = super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
         return loss
 
     def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
diff --git a/text-generation-inference/tests/integration/test_generate.py b/text-generation-inference/tests/integration/test_generate.py
index 75c064a38..db716be57 100644
--- a/text-generation-inference/tests/integration/test_generate.py
+++ b/text-generation-inference/tests/integration/test_generate.py
@@ -48,8 +48,8 @@ async def test_model_single_request(tgi_service):
     )
     sample_expectations = {
         "gpt2": "Deep Learning",
-        "llama": "Deep learning",
-        "mistral": "Deep Learning",
+        "llama": "Deep Learning",
+        "mistral": "Deep learning",
         "qwen2": "Deep Learning",
         "granite": "Deep learning",
     }
diff --git a/text-generation-inference/tests/server/test_decode.py b/text-generation-inference/tests/server/test_decode.py
index 5bfc6ca97..2ab4c2da0 100644
--- a/text-generation-inference/tests/server/test_decode.py
+++ b/text-generation-inference/tests/server/test_decode.py
@@ -36,11 +36,11 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " the wind was blowing",
-            "llama": "George Orwell",
-            "mistral": "The sky is black",
-            "qwen2": " I stood in the back yard",
-            "granite": "Aldous Huxley, Brave New World",
+            "gpt2": " The sun was set",
+            "llama": "George Orwell, 1984",
+            "mistral": "The sky was",
+            "qwen2": " A young woman with",
+            "granite": "1984, George Orwell",
         }[config_name]
         assert expected_text in output.text
     else:
diff --git a/text-generation-inference/tests/server/test_prefill.py b/text-generation-inference/tests/server/test_prefill.py
index 7214a6b6a..2120e5c59 100644
--- a/text-generation-inference/tests/server/test_prefill.py
+++ b/text-generation-inference/tests/server/test_prefill.py
@@ -35,11 +35,11 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "gpt2": [632, " It"],
+            "gpt2": [383, " The"],
             "llama": [10058, " George"],
             "mistral": [450, " The"],
-            "qwen2": [358, " I"],
-            "granite": [429, " -"],
+            "qwen2": [362, " A"],
+            "granite": [308, " ("],
         }[config_name]
     else:
         expectations = {