Improve llama models performance (#587)
* fix(bench): allow launch from top directory

* feat(decoder): add attention layout to exporter

* feat(decoder): set attention_layout to BSH for llama

This reduces encoding (prefill) time.

* fix(bench): make sure max_new_tokens are generated

* perf(llama3): add generation benchmark

* docs(benchmark): update Mistral results

* docs(benchmark): new Llama2-7b results with 8 cores

* perf(tgi): update Llama-2-7b results

* docs(benchmark): update Llama13B results on 8 cores

* feat(tgi): bump router version to 2.0.2

* Update docs/source/_toctree.yml

Co-authored-by: Michael Benayoun <[email protected]>

---------

Co-authored-by: Michael Benayoun <[email protected]>
dacorvo and michaelbenayoun authored May 3, 2024
1 parent 18460aa commit 707fff4
Showing 26 changed files with 143 additions and 29 deletions.
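For context, a minimal sketch of the user-facing export path that benefits from the BSH attention layout change described above (mirroring the API used by the benchmark scripts in this commit; the model id and parameters are illustrative):

```python
from optimum.neuron import NeuronModelForCausalLM

# Export a Llama checkpoint for Neuron. With this commit the exporter selects
# the BSH attention layout for llama models, which reduces prefill time.
model = NeuronModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # illustrative model id
    export=True,
    batch_size=4,
    sequence_length=4096,
    auto_cast_type="fp16",
    num_cores=8,
)
```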
2 changes: 1 addition & 1 deletion Makefile
@@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
$(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
python -m build

TGI_VERSION ?= 2.0.1
TGI_VERSION ?= 2.0.2

neuronx-tgi: $(PACKAGE_DIST)
docker build --rm -f text-generation-inference/Dockerfile \
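With the bump to 2.0.2, the Neuron TGI image would presumably be rebuilt through the `neuronx-tgi` target shown above, e.g. `make neuronx-tgi` (or `make neuronx-tgi TGI_VERSION=2.0.2` to pin the version explicitly).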
20 changes: 10 additions & 10 deletions benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -1,11 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.84493535907894,0.435653425001874,70.64353697527179
huggingface/NousResearch/Llama-2-7b-chat-hf,2,25.213946432976638,0.4359589194991713,70.55276283551507
huggingface/NousResearch/Llama-2-7b-chat-hf,4,43.26619632041904,0.43764654000005976,71.40762554352298
huggingface/NousResearch/Llama-2-7b-chat-hf,8,81.7002047989417,0.46597404000203824,74.66130438229308
huggingface/NousResearch/Llama-2-7b-chat-hf,16,148.73365777295837,0.8807341205010744,79.46121462672393
huggingface/NousResearch/Llama-2-7b-chat-hf,32,241.07605116636378,2.58607812900118,91.31557495460669
huggingface/NousResearch/Llama-2-7b-chat-hf,64,338.6319898631105,6.262418706501194,118.3833058616551
huggingface/NousResearch/Llama-2-7b-chat-hf,128,410.3968188304912,12.920248634000018,167.830813359929
huggingface/NousResearch/Llama-2-7b-chat-hf,256,478.76738958996015,29.621474924000722,257.6998219293685
huggingface/NousResearch/Llama-2-7b-chat-hf,512,496.5535875105274,44.485632503998204,329.7294857727593
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.811941495564616,0.3781782309997652,71.37198062194233
huggingface/NousResearch/Llama-2-7b-chat-hf,2,23.461539426271507,0.3602376449998701,71.70553820509232
huggingface/NousResearch/Llama-2-7b-chat-hf,4,45.45448705790145,0.3612828944997091,73.58663426819392
huggingface/NousResearch/Llama-2-7b-chat-hf,8,71.13444471932405,0.3752646894999998,74.85884378373552
huggingface/NousResearch/Llama-2-7b-chat-hf,16,138.54599491404485,0.6447374934998606,81.11484812939682
huggingface/NousResearch/Llama-2-7b-chat-hf,32,247.32811870027916,1.0393478490004782,85.0958261705239
huggingface/NousResearch/Llama-2-7b-chat-hf,64,391.3595246354876,2.2831421710016,99.36474989676213
huggingface/NousResearch/Llama-2-7b-chat-hf,128,464.82600069905294,3.342431744500118,120.29151899306808
huggingface/NousResearch/Llama-2-7b-chat-hf,256,526.7164477974997,6.532527566999306,160.52458146930456
huggingface/NousResearch/Llama-2-7b-chat-hf,512,506.7975712115936,27.33909000099993,260.14547684970137
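A hypothetical snippet (not part of the commit) to inspect the refreshed results, assuming pandas is available:

```python
import pandas as pd

# Load the updated Llama-2-7b TGI benchmark results.
df = pd.read_csv("benchmark/text-generation-inference/llama-7b/tgi-results.csv")
# Peak throughput across concurrency levels, and TTFT at 128 concurrent requests.
print(df["throughput (t/s)"].max())
print(df.loc[df["concurrent requests"] == 128, "Time-to-first-token @ P50 (s)"])
```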
11 changes: 6 additions & 5 deletions benchmark/text-generation/benchmark.py
@@ -9,17 +9,18 @@
from optimum.neuron import NeuronModelForCausalLM


def generate(model, input_ids, max_new_tokens):
def generate(model, input_ids, output_length):
start = time.time()
with torch.inference_mode():
output_tokens = model.generate(input_ids, do_sample=False, max_new_tokens=max_new_tokens)
output_tokens = model.generate(input_ids, do_sample=False, min_length=output_length, max_length=output_length)
end = time.time()
return output_tokens, (end - start)


def run(model_id, inc_length, max_length, json_path=None):
# Encode the reference prompt
with open("./wiki.txt") as f:
local_path = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(local_path, "wiki.txt")) as f:
prompt = f.read()
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer([prompt], return_tensors="pt")
@@ -35,10 +36,10 @@ def get_input_ids(tokens, batch_size, input_length):
benchmark = {"neuron_config": neuron_config, "results": []}
for input_length in range(inc_length, max_length - inc_length + 1, inc_length):
# Generate a single input, just to evaluate the context encoding time
input_ids = get_input_ids(tokens, batch_size, input_length)
input_ids = get_input_ids(tokens, batch_size, input_length + 1)
_, encoding_time = generate(model, input_ids, 1)
new_tokens = inc_length
output_ids, duration = generate(model, input_ids, new_tokens)
output_ids, duration = generate(model, input_ids, input_length + new_tokens)
latency = (duration - encoding_time) / new_tokens * 1000
throughput = new_tokens * batch_size / duration
result = {
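Switching from `max_new_tokens` to matching `min_length`/`max_length` guarantees that the benchmark really times the requested number of new tokens, even when greedy decoding would otherwise stop at an EOS token. A minimal sketch of the difference (illustrative values, not part of the diff):

```python
# With only max_new_tokens, greedy decoding may stop early at an EOS token,
# so fewer tokens than requested would be timed.
outputs = model.generate(input_ids, do_sample=False, max_new_tokens=256)

# Pinning the total length (prompt + new tokens) forces exactly 256 new tokens,
# which is what the updated generate() helper above relies on.
output_length = input_ids.shape[-1] + 256
outputs = model.generate(
    input_ids, do_sample=False, min_length=output_length, max_length=output_length
)
```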
3 changes: 1 addition & 2 deletions benchmark/text-generation/llama2-13b.py
@@ -8,7 +8,7 @@


def main():
NUM_CORES = 24
NUM_CORES = 8
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
@@ -18,7 +18,6 @@ def main():
"Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
"Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
"Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
"Llama-2-13B-BS32": ["meta-llama/Llama-2-13b-chat-hf", 32, 4096],
}

for model_name, model_configuration in model_configurations.items():
2 changes: 1 addition & 1 deletion benchmark/text-generation/llama2-7b.py
@@ -8,7 +8,7 @@


def main():
NUM_CORES = 24
NUM_CORES = 8
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
43 changes: 43 additions & 0 deletions benchmark/text-generation/llama3-8b.py
@@ -0,0 +1,43 @@
from tempfile import TemporaryDirectory

from transformers import AutoTokenizer

from benchmark import run
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.modeling_decoder import get_available_cores


def main():
NUM_CORES = 8
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")

model_configurations = {
"Llama-3-8B-BS1": ["meta-llama/Meta-Llama-3-8B", 1, 4096],
"Llama-3-8B-BS4": ["meta-llama/Meta-Llama-3-8B", 4, 4096],
"Llama-3-8B-BS8": ["meta-llama/Meta-Llama-3-8B", 8, 4096],
"Llama-3-8B-BS16": ["meta-llama/Meta-Llama-3-8B", 16, 4096],
"Llama-3-8B-BS32": ["meta-llama/Meta-Llama-3-8B", 32, 4096],
}

for model_name, model_configuration in model_configurations.items():
model_id, batch_size, seq_length = model_configuration
model = NeuronModelForCausalLM.from_pretrained(
model_id,
export=True,
batch_size=batch_size,
sequence_length=seq_length,
auto_cast_type="fp16",
num_cores=NUM_CORES,
)
with TemporaryDirectory() as tmpdir:
model.save_pretrained(tmpdir)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(tmpdir)
json_path = f"{model_name}.json"
run(tmpdir, 256, 2048, json_path=json_path)


if __name__ == "__main__":
main()
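Like the other generation benchmarks, this script would presumably be launched from the repository root (made possible by the `wiki.txt` path fix above), e.g. `python benchmark/text-generation/llama3-8b.py`; it assumes access to the `meta-llama/Meta-Llama-3-8B` checkpoint and writes one `<model_name>.json` result file per configuration.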
Binary file modified docs/assets/benchmarks/inferentia-llama2-13b/latency.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-13b/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-13b/ttft.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/latency.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/ttft.png
3 more binary image files (names not shown; cannot be displayed)
Binary file modified docs/assets/benchmarks/inferentia-mistral-v2/latency.png
Binary file modified docs/assets/benchmarks/inferentia-mistral-v2/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-mistral-v2/ttft.png
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -50,6 +50,8 @@
title: Llama2 13b on AWS Inferentia2
- local: benchmarks/inferentia-mistral-v2
title: Mistral v0.2 7b on AWS Inferentia2
- local: benchmarks/inferentia-llama3-8b
title: Llama-3 8B on AWS Inferentia2
title: Benchmarks
- sections:
- local: community/contributing
11 changes: 5 additions & 6 deletions docs/source/benchmarks/inferentia-llama2-13b.mdx
@@ -22,13 +22,12 @@ For this benchmark we will use the following configurations:

| Model type | batch_size | sequence_length |
|-----------------|------------|-----------------|
| Llama2 13b BS1 | 1 | 4096 |
| Llama2 13b BS4 | 4 | 4096 |
| Llama2 13b BS8 | 8 | 4096 |
| Llama2 13b BS16 | 16 | 4096 |
| Llama2 13b BS32 | 32 | 4096 |
| Llama2 13B BS1 | 1 | 4096 |
| Llama2 13B BS4 | 4 | 4096 |
| Llama2 13B BS8 | 8 | 4096 |
| Llama2 13B BS16 | 16 | 4096 |

*Note: all models are compiled to use all 12 devices corresponding to 24 cores on the `inf2.48xlarge` instance.*
*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

2 changes: 1 addition & 1 deletion docs/source/benchmarks/inferentia-llama2-7b.mdx
@@ -28,7 +28,7 @@ For this benchmark we will use the following configurations:
| Llama2 7B BS16 | 16 | 4096 |
| Llama2 7B BS32 | 32 | 4096 |

*Note: all models are compiled to use all 12 devices corresponding to 24 cores on the `inf2.48xlarge` instance.*
*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

61 changes: 61 additions & 0 deletions docs/source/benchmarks/inferentia-llama3-8b.mdx
@@ -0,0 +1,61 @@
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Llama-3-8b performance on AWS Inferentia2 (Latency & Throughput)

How fast is Llama-3-8b on Inferentia2? Let's find out!

For this benchmark we will use the following configurations:

| Model type | batch_size | sequence_length |
|----------------|------------|-----------------|
| Llama3 8b BS1 | 1 | 4096 |
| Llama3 8b BS4 | 4 | 4096 |
| Llama3 8b BS8 | 8 | 4096 |
| Llama3 8b BS16 | 16 | 4096 |
| Llama3 8b BS32 | 32 | 4096 |

*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

## Time to first token

The time to first token is the time required to process the input tokens and generate the first output token.
It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.

We test the time to first token for increasing context sizes, from typical Q/A usage to heavy Retrieval Augmented Generation (RAG) use cases.

Time to first token is expressed in **seconds**.

![Llama3 8b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png "Time to first token")

## Inter-token Latency

The inter-token latency corresponds to the average time elapsed between two generated tokens.

It is expressed in **milliseconds**.

![Llama3 8b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/latency.png "Inter-token latency")

## Throughput

Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number
by the end-to-end latency.

Throughput is expressed in **tokens/second**.

![Llama3 8b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png "Throughput")
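For reference, an editorial sketch of how these metrics map onto the computation in `benchmark/text-generation/benchmark.py` from this commit (variable names follow that script):

```python
# encoding_time approximates the time to first token (prompt processing only),
# latency is the average inter-token time in milliseconds,
# throughput counts generated tokens only, over the end-to-end duration.
latency = (duration - encoding_time) / new_tokens * 1000
throughput = new_tokens * batch_size / duration
```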
8 changes: 7 additions & 1 deletion optimum/exporters/neuron/base.py
@@ -387,7 +387,8 @@ class NeuronDecoderConfig(NeuronConfig):
be passed to export the model,
- NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model.
It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling`
- CONTINUOUS_BATCHING (`bool`, , defaults to `False`) -- Whether the model supports continuous batching or not.
- CONTINUOUS_BATCHING (`bool`, defaults to `False`) -- Whether the model supports continuous batching or not.
- ATTENTION_LAYOUT (`str`, defaults to `HSB`) -- Layout to be used for attention computation.
The NEURONX_CLASS must always be defined in each model configuration.
@@ -398,6 +399,7 @@ class NeuronDecoderConfig(NeuronConfig):
INPUT_ARGS = ("batch_size", "sequence_length")
NEURONX_CLASS = None
CONTINUOUS_BATCHING = False
ATTENTION_lAYOUT = "HSB"

def __init__(self, task: str):
if not is_transformers_neuronx_available():
@@ -417,3 +419,7 @@ def neuronx_class(self):
@property
def continuous_batching(self):
return self.CONTINUOUS_BATCHING

@property
def attention_layout(self):
return self.ATTENTION_lAYOUT
1 change: 1 addition & 0 deletions optimum/exporters/neuron/model_configs.py
@@ -461,6 +461,7 @@ class GPT2NeuronConfig(TextNeuronDecoderConfig):
class LLamaNeuronConfig(TextNeuronDecoderConfig):
NEURONX_CLASS = "llama.model.LlamaForSampling"
CONTINUOUS_BATCHING = True
ATTENTION_lAYOUT = "BSH"


@register_in_tasks_manager("t5-encoder", "text2text-generation")
4 changes: 3 additions & 1 deletion optimum/neuron/modeling_decoder.py
@@ -179,11 +179,13 @@ def __init__(
# Continuous batching is always enabled for models that support it because static batching
# is broken for these models: see https://github.com/aws-neuron/transformers-neuronx/issues/79
tnx_kwargs["neuron_config"] = NeuronConfig(
continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size)
continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size),
attention_layout=exporter.attention_layout,
)
tnx_kwargs["n_positions"] = [sequence_length]
tnx_kwargs["context_length_estimate"] = [sequence_length]
else:
tnx_kwargs["neuron_config"] = NeuronConfig(attention_layout=exporter.attention_layout)
tnx_kwargs["n_positions"] = sequence_length

# Instantiate neuronx model
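A hypothetical illustration of the kwargs assembled above for a Llama model exported with `batch_size=4` and `sequence_length=4096` (values illustrative; class names as in the diff):

```python
# Continuous-batching branch: the exporter-provided attention layout
# ("BSH" for llama after this commit) is now forwarded to transformers-neuronx.
tnx_kwargs = {
    "neuron_config": NeuronConfig(
        continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=4),
        attention_layout="BSH",
    ),
    "n_positions": [4096],
    "context_length_estimate": [4096],
}
```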
2 changes: 1 addition & 1 deletion text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
TGI_VERSION ?= 2.0.1
TGI_VERSION ?= 2.0.2
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
