Improve llama models performance (#587)
* fix(bench): allow launch from top directory

* feat(decoder): add attention layout to exporter

* feat(decoder): set attention_layout to BSH for llama

This reduces encoding (prefill) time.

* fix(bench): make sure max_new_tokens are generated

* perf(llama3): add generation benchmark

* docs(benchmark): update Mistral results

* docs(benchmark): new Llama2-7b results with 8 cores

* perf(tgi): update Llama-2-7b results

* docs(benchmark): update Llama13B results on 8 cores

* feat(tgi): bump router version to 2.0.2

* Update docs/source/_toctree.yml

Co-authored-by: Michael Benayoun <[email protected]>

---------

Co-authored-by: Michael Benayoun <[email protected]>
dacorvo and michaelbenayoun authored May 3, 2024
1 parent 18460aa commit 707fff4
Showing 26 changed files with 143 additions and 29 deletions.
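For context, a minimal sketch of the user-facing export path that benefits from the BSH attention layout change described above (mirroring the API used by the benchmark scripts in this commit; the model id and parameters are illustrative):

```python
from optimum.neuron import NeuronModelForCausalLM

# Export a Llama checkpoint for Neuron. With this commit the exporter selects
# the BSH attention layout for llama models, which reduces prefill time.
model = NeuronModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # illustrative model id
    export=True,
    batch_size=4,
    sequence_length=4096,
    auto_cast_type="fp16",
    num_cores=8,
)
```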
2 changes: 1 addition & 1 deletion Makefile
@@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
$(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
python -m build

TGI_VERSION ?= 2.0.1
TGI_VERSION ?= 2.0.2

neuronx-tgi: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile \
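With the bump to 2.0.2, the Neuron TGI image would presumably be rebuilt through the `neuronx-tgi` target shown above, e.g. `make neuronx-tgi` (or `make neuronx-tgi TGI_VERSION=2.0.2` to pin the version explicitly).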
20 changes: 10 additions & 10 deletions benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -1,11 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.84493535907894,0.435653425001874,70.64353697527179
huggingface/NousResearch/Llama-2-7b-chat-hf,2,25.213946432976638,0.4359589194991713,70.55276283551507
huggingface/NousResearch/Llama-2-7b-chat-hf,4,43.26619632041904,0.43764654000005976,71.40762554352298
huggingface/NousResearch/Llama-2-7b-chat-hf,8,81.7002047989417,0.46597404000203824,74.66130438229308
huggingface/NousResearch/Llama-2-7b-chat-hf,16,148.73365777295837,0.8807341205010744,79.46121462672393
huggingface/NousResearch/Llama-2-7b-chat-hf,32,241.07605116636378,2.58607812900118,91.31557495460669
huggingface/NousResearch/Llama-2-7b-chat-hf,64,338.6319898631105,6.262418706501194,118.3833058616551
huggingface/NousResearch/Llama-2-7b-chat-hf,128,410.3968188304912,12.920248634000018,167.830813359929
huggingface/NousResearch/Llama-2-7b-chat-hf,256,478.76738958996015,29.621474924000722,257.6998219293685
huggingface/NousResearch/Llama-2-7b-chat-hf,512,496.5535875105274,44.485632503998204,329.7294857727593
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.811941495564616,0.3781782309997652,71.37198062194233
huggingface/NousResearch/Llama-2-7b-chat-hf,2,23.461539426271507,0.3602376449998701,71.70553820509232
huggingface/NousResearch/Llama-2-7b-chat-hf,4,45.45448705790145,0.3612828944997091,73.58663426819392
huggingface/NousResearch/Llama-2-7b-chat-hf,8,71.13444471932405,0.3752646894999998,74.85884378373552
huggingface/NousResearch/Llama-2-7b-chat-hf,16,138.54599491404485,0.6447374934998606,81.11484812939682
huggingface/NousResearch/Llama-2-7b-chat-hf,32,247.32811870027916,1.0393478490004782,85.0958261705239
huggingface/NousResearch/Llama-2-7b-chat-hf,64,391.3595246354876,2.2831421710016,99.36474989676213
huggingface/NousResearch/Llama-2-7b-chat-hf,128,464.82600069905294,3.342431744500118,120.29151899306808
huggingface/NousResearch/Llama-2-7b-chat-hf,256,526.7164477974997,6.532527566999306,160.52458146930456
huggingface/NousResearch/Llama-2-7b-chat-hf,512,506.7975712115936,27.33909000099993,260.14547684970137
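A hypothetical snippet (not part of the commit) to inspect the refreshed results, assuming pandas is available:

```python
import pandas as pd

# Load the updated Llama-2-7b TGI benchmark results.
df = pd.read_csv("benchmark/text-generation-inference/llama-7b/tgi-results.csv")
# Peak throughput across concurrency levels, and TTFT at 128 concurrent requests.
print(df["throughput (t/s)"].max())
print(df.loc[df["concurrent requests"] == 128, "Time-to-first-token @ P50 (s)"])
```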
11 changes: 6 additions & 5 deletions benchmark/text-generation/benchmark.py
@@ -9,17 +9,18 @@
from optimum.neuron import NeuronModelForCausalLM


def generate(model, input_ids, max_new_tokens):
def generate(model, input_ids, output_length):
start = time.time()
with torch.inference_mode():
output_tokens = model.generate(input_ids, do_sample=False, max_new_tokens=max_new_tokens)
output_tokens = model.generate(input_ids, do_sample=False, min_length=output_length, max_length=output_length)
end = time.time()
return output_tokens, (end - start)


def run(model_id, inc_length, max_length, json_path=None):
# Encode the reference prompt
with open("./wiki.txt") as f:
local_path = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(local_path, "wiki.txt")) as f:
prompt = f.read()
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer([prompt], return_tensors="pt")
@@ -35,10 +36,10 @@ def get_input_ids(tokens, batch_size, input_length):
benchmark = {"neuron_config": neuron_config, "results": []}
for input_length in range(inc_length, max_length - inc_length + 1, inc_length):
# Generate a single input, just to evaluate the context encoding time
input_ids = get_input_ids(tokens, batch_size, input_length)
input_ids = get_input_ids(tokens, batch_size, input_length + 1)
_, encoding_time = generate(model, input_ids, 1)
new_tokens = inc_length
output_ids, duration = generate(model, input_ids, new_tokens)
output_ids, duration = generate(model, input_ids, input_length + new_tokens)
latency = (duration - encoding_time) / new_tokens * 1000
throughput = new_tokens * batch_size / duration
result = {
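Switching from `max_new_tokens` to matching `min_length`/`max_length` guarantees that the benchmark really times the requested number of new tokens, even when greedy decoding would otherwise stop at an EOS token. A minimal sketch of the difference (illustrative values, not part of the diff):

```python
# With only max_new_tokens, greedy decoding may stop early at an EOS token,
# so fewer tokens than requested would be timed.
outputs = model.generate(input_ids, do_sample=False, max_new_tokens=256)

# Pinning the total length (prompt + new tokens) forces exactly 256 new tokens,
# which is what the updated generate() helper above relies on.
output_length = input_ids.shape[-1] + 256
outputs = model.generate(
    input_ids, do_sample=False, min_length=output_length, max_length=output_length
)
```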
3 changes: 1 addition & 2 deletions benchmark/text-generation/llama2-13b.py
@@ -8,7 +8,7 @@


def main():
NUM_CORES = 24
NUM_CORES = 8
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
@@ -18,7 +18,6 @@ def main():
"Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
"Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
"Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
"Llama-2-13B-BS32": ["meta-llama/Llama-2-13b-chat-hf", 32, 4096],
}

for model_name, model_configuration in model_configurations.items():
2 changes: 1 addition & 1 deletion benchmark/text-generation/llama2-7b.py
@@ -8,7 +8,7 @@


def main():
NUM_CORES = 24
NUM_CORES = 8
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
43 changes: 43 additions & 0 deletions benchmark/text-generation/llama3-8b.py
@@ -0,0 +1,43 @@
from tempfile import TemporaryDirectory

from transformers import AutoTokenizer

from benchmark import run
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.modeling_decoder import get_available_cores


def main():
NUM_CORES = 8
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")

model_configurations = {
"Llama-3-8B-BS1": ["meta-llama/Meta-Llama-3-8B", 1, 4096],
"Llama-3-8B-BS4": ["meta-llama/Meta-Llama-3-8B", 4, 4096],
"Llama-3-8B-BS8": ["meta-llama/Meta-Llama-3-8B", 8, 4096],
"Llama-3-8B-BS16": ["meta-llama/Meta-Llama-3-8B", 16, 4096],
"Llama-3-8B-BS32": ["meta-llama/Meta-Llama-3-8B", 32, 4096],
}

for model_name, model_configuration in model_configurations.items():
model_id, batch_size, seq_length = model_configuration
model = NeuronModelForCausalLM.from_pretrained(
model_id,
export=True,
batch_size=batch_size,
sequence_length=seq_length,
auto_cast_type="fp16",
num_cores=NUM_CORES,
)
with TemporaryDirectory() as tmpdir:
model.save_pretrained(tmpdir)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(tmpdir)
json_path = f"{model_name}.json"
run(tmpdir, 256, 2048, json_path=json_path)


if __name__ == "__main__":
main()
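Like the other generation benchmarks, this script would presumably be launched from the repository root (made possible by the `wiki.txt` path fix above), e.g. `python benchmark/text-generation/llama3-8b.py`; it assumes access to the `meta-llama/Meta-Llama-3-8B` checkpoint and writes one `<model_name>.json` result file per configuration.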
Binary file modified docs/assets/benchmarks/inferentia-llama2-13b/latency.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-13b/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-13b/ttft.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/latency.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/ttft.png
3 more binary image files (names not shown; cannot be displayed)
Binary file modified docs/assets/benchmarks/inferentia-mistral-v2/latency.png
Binary file modified docs/assets/benchmarks/inferentia-mistral-v2/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-mistral-v2/ttft.png
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -50,6 +50,8 @@
title: Llama2 13b on AWS Inferentia2
- local: benchmarks/inferentia-mistral-v2
title: Mistral v0.2 7b on AWS Inferentia2
- local: benchmarks/inferentia-llama3-8b
title: Llama-3 8B on AWS Inferentia2
title: Benchmarks
- sections:
- local: community/contributing
11 changes: 5 additions & 6 deletions docs/source/benchmarks/inferentia-llama2-13b.mdx
@@ -22,13 +22,12 @@ For this benchmark we will use the following configurations:

| Model type | batch_size | sequence_length |
|-----------------|------------|-----------------|
| Llama2 13b BS1 | 1 | 4096 |
| Llama2 13b BS4 | 4 | 4096 |
| Llama2 13b BS8 | 8 | 4096 |
| Llama2 13b BS16 | 16 | 4096 |
| Llama2 13b BS32 | 32 | 4096 |
| Llama2 13B BS1 | 1 | 4096 |
| Llama2 13B BS4 | 4 | 4096 |
| Llama2 13B BS8 | 8 | 4096 |
| Llama2 13B BS16 | 16 | 4096 |

*Note: all models are compiled to use all 12 devices corresponding to 24 cores on the `inf2.48xlarge` instance.*
*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

2 changes: 1 addition & 1 deletion docs/source/benchmarks/inferentia-llama2-7b.mdx
@@ -28,7 +28,7 @@ For this benchmark we will use the following configurations:
| Llama2 7B BS16 | 16 | 4096 |
| Llama2 7B BS32 | 32 | 4096 |

*Note: all models are compiled to use all 12 devices corresponding to 24 cores on the `inf2.48xlarge` instance.*
*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

61 changes: 61 additions & 0 deletions docs/source/benchmarks/inferentia-llama3-8b.mdx
@@ -0,0 +1,61 @@
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Llama-3-8b performance on AWS Inferentia2 (Latency & Throughput)

How fast is Llama-3-8b on Inferentia2? Let's find out!

For this benchmark we will use the following configurations:

| Model type | batch_size | sequence_length |
|----------------|------------|-----------------|
| Llama3 8b BS1 | 1 | 4096 |
| Llama3 8b BS4 | 4 | 4096 |
| Llama3 8b BS8 | 8 | 4096 |
| Llama3 8b BS16 | 16 | 4096 |
| Llama3 8b BS32 | 32 | 4096 |

*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

## Time to first token

The time to first token is the time required to process the input tokens and generate the first output token.
It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.

We test the time to first token for increasing context sizes, from typical Q/A usage to heavy Retrieval Augmented Generation (RAG) use cases.

Time to first token is expressed in **seconds**.

![Llama3 8b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png "Time to first token")

## Inter-token Latency

The inter-token latency corresponds to the average time elapsed between two generated tokens.

It is expressed in **milliseconds**.

![Llama3 8b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/latency.png "Inter-token latency")

## Throughput

Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number
by the end-to-end latency.

Throughput is expressed in **tokens/second**.

![Llama3 8b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png "Throughput")
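For reference, an editorial sketch of how these metrics map onto the computation in `benchmark/text-generation/benchmark.py` from this commit (variable names follow that script):

```python
# encoding_time approximates the time to first token (prompt processing only),
# latency is the average inter-token time in milliseconds,
# throughput counts generated tokens only, over the end-to-end duration.
latency = (duration - encoding_time) / new_tokens * 1000
throughput = new_tokens * batch_size / duration
```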
8 changes: 7 additions & 1 deletion optimum/exporters/neuron/base.py
@@ -387,7 +387,8 @@ class NeuronDecoderConfig(NeuronConfig):
be passed to export the model,
- NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model.
It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling`
- CONTINUOUS_BATCHING (`bool`, , defaults to `False`) -- Whether the model supports continuous batching or not.
- CONTINUOUS_BATCHING (`bool`, defaults to `False`) -- Whether the model supports continuous batching or not.
- ATTENTION_LAYOUT (`str`, defaults to `HSB`) -- Layout to be used for attention computation.
The NEURONX_CLASS must always be defined in each model configuration.
@@ -398,6 +399,7 @@ class NeuronDecoderConfig(NeuronConfig):
INPUT_ARGS = ("batch_size", "sequence_length")
NEURONX_CLASS = None
CONTINUOUS_BATCHING = False
ATTENTION_lAYOUT = "HSB"

def __init__(self, task: str):
if not is_transformers_neuronx_available():
@@ -417,3 +419,7 @@ def neuronx_class(self):
@property
def continuous_batching(self):
return self.CONTINUOUS_BATCHING

@property
def attention_layout(self):
return self.ATTENTION_lAYOUT
1 change: 1 addition & 0 deletions optimum/exporters/neuron/model_configs.py
@@ -461,6 +461,7 @@ class GPT2NeuronConfig(TextNeuronDecoderConfig):
class LLamaNeuronConfig(TextNeuronDecoderConfig):
NEURONX_CLASS = "llama.model.LlamaForSampling"
CONTINUOUS_BATCHING = True
ATTENTION_lAYOUT = "BSH"


@register_in_tasks_manager("t5-encoder", "text2text-generation")
4 changes: 3 additions & 1 deletion optimum/neuron/modeling_decoder.py
@@ -179,11 +179,13 @@ def __init__(
# Continuous batching is always enabled for models that support it because static batching
# is broken for these models: see https://github.com/aws-neuron/transformers-neuronx/issues/79
tnx_kwargs["neuron_config"] = NeuronConfig(
continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size)
continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size),
attention_layout=exporter.attention_layout,
)
tnx_kwargs["n_positions"] = [sequence_length]
tnx_kwargs["context_length_estimate"] = [sequence_length]
else:
tnx_kwargs["neuron_config"] = NeuronConfig(attention_layout=exporter.attention_layout)
tnx_kwargs["n_positions"] = sequence_length

# Instantiate neuronx model
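A hypothetical illustration of the kwargs assembled above for a Llama model exported with `batch_size=4` and `sequence_length=4096` (values illustrative; class names as in the diff):

```python
# Continuous-batching branch: the exporter-provided attention layout
# ("BSH" for llama after this commit) is now forwarded to transformers-neuronx.
tnx_kwargs = {
    "neuron_config": NeuronConfig(
        continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=4),
        attention_layout="BSH",
    ),
    "n_positions": [4096],
    "context_length_estimate": [4096],
}
```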
2 changes: 1 addition & 1 deletion text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
TGI_VERSION ?= 2.0.1
TGI_VERSION ?= 2.0.2
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
