diff --git a/Makefile b/Makefile
index c0fa4229e..3fe53135d 100644
--- a/Makefile
+++ b/Makefile
@@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
 $(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
 	python -m build

-TGI_VERSION ?= 2.0.1
+TGI_VERSION ?= 2.0.2

 neuronx-tgi: $(PACKAGE_DIST)
 	docker build --rm -f text-generation-inference/Dockerfile \
diff --git a/benchmark/text-generation-inference/llama-7b/tgi-results.csv b/benchmark/text-generation-inference/llama-7b/tgi-results.csv
index cc0ebc878..96f382dc9 100644
--- a/benchmark/text-generation-inference/llama-7b/tgi-results.csv
+++ b/benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -1,11 +1,11 @@
 model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
-huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.84493535907894,0.435653425001874,70.64353697527179
-huggingface/NousResearch/Llama-2-7b-chat-hf,2,25.213946432976638,0.4359589194991713,70.55276283551507
-huggingface/NousResearch/Llama-2-7b-chat-hf,4,43.26619632041904,0.43764654000005976,71.40762554352298
-huggingface/NousResearch/Llama-2-7b-chat-hf,8,81.7002047989417,0.46597404000203824,74.66130438229308
-huggingface/NousResearch/Llama-2-7b-chat-hf,16,148.73365777295837,0.8807341205010744,79.46121462672393
-huggingface/NousResearch/Llama-2-7b-chat-hf,32,241.07605116636378,2.58607812900118,91.31557495460669
-huggingface/NousResearch/Llama-2-7b-chat-hf,64,338.6319898631105,6.262418706501194,118.3833058616551
-huggingface/NousResearch/Llama-2-7b-chat-hf,128,410.3968188304912,12.920248634000018,167.830813359929
-huggingface/NousResearch/Llama-2-7b-chat-hf,256,478.76738958996015,29.621474924000722,257.6998219293685
-huggingface/NousResearch/Llama-2-7b-chat-hf,512,496.5535875105274,44.485632503998204,329.7294857727593
+huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.811941495564616,0.3781782309997652,71.37198062194233
+huggingface/NousResearch/Llama-2-7b-chat-hf,2,23.461539426271507,0.3602376449998701,71.70553820509232
+huggingface/NousResearch/Llama-2-7b-chat-hf,4,45.45448705790145,0.3612828944997091,73.58663426819392
+huggingface/NousResearch/Llama-2-7b-chat-hf,8,71.13444471932405,0.3752646894999998,74.85884378373552
+huggingface/NousResearch/Llama-2-7b-chat-hf,16,138.54599491404485,0.6447374934998606,81.11484812939682
+huggingface/NousResearch/Llama-2-7b-chat-hf,32,247.32811870027916,1.0393478490004782,85.0958261705239
+huggingface/NousResearch/Llama-2-7b-chat-hf,64,391.3595246354876,2.2831421710016,99.36474989676213
+huggingface/NousResearch/Llama-2-7b-chat-hf,128,464.82600069905294,3.342431744500118,120.29151899306808
+huggingface/NousResearch/Llama-2-7b-chat-hf,256,526.7164477974997,6.532527566999306,160.52458146930456
+huggingface/NousResearch/Llama-2-7b-chat-hf,512,506.7975712115936,27.33909000099993,260.14547684970137
diff --git a/benchmark/text-generation/benchmark.py b/benchmark/text-generation/benchmark.py
index 7b333c629..98234af0c 100644
--- a/benchmark/text-generation/benchmark.py
+++ b/benchmark/text-generation/benchmark.py
@@ -9,17 +9,18 @@
 from optimum.neuron import NeuronModelForCausalLM


-def generate(model, input_ids, max_new_tokens):
+def generate(model, input_ids, output_length):
     start = time.time()
     with torch.inference_mode():
-        output_tokens = model.generate(input_ids, do_sample=False, max_new_tokens=max_new_tokens)
+        output_tokens = model.generate(input_ids, do_sample=False, min_length=output_length, max_length=output_length)
     end = time.time()
     return output_tokens, (end - start)


 def run(model_id, inc_length, max_length, json_path=None):
     # Encode the reference prompt
-    with open("./wiki.txt") as f:
+    local_path = os.path.dirname(os.path.realpath(__file__))
+    with open(os.path.join(local_path, "wiki.txt")) as f:
         prompt = f.read()
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokens = tokenizer([prompt], return_tensors="pt")
@@ -35,10 +36,10 @@ def get_input_ids(tokens, batch_size, input_length):
     benchmark = {"neuron_config": neuron_config, "results": []}
     for input_length in range(inc_length, max_length - inc_length + 1, inc_length):
         # Generate a single input, just to evaluate the context encoding time
-        input_ids = get_input_ids(tokens, batch_size, input_length)
+        input_ids = get_input_ids(tokens, batch_size, input_length + 1)
         _, encoding_time = generate(model, input_ids, 1)
         new_tokens = inc_length
-        output_ids, duration = generate(model, input_ids, new_tokens)
+        output_ids, duration = generate(model, input_ids, input_length + new_tokens)
         latency = (duration - encoding_time) / new_tokens * 1000
         throughput = new_tokens * batch_size / duration
         result = {
diff --git a/benchmark/text-generation/llama2-13b.py b/benchmark/text-generation/llama2-13b.py
index f5a35bdbe..03bcf70c2 100644
--- a/benchmark/text-generation/llama2-13b.py
+++ b/benchmark/text-generation/llama2-13b.py
@@ -8,7 +8,7 @@


 def main():
-    NUM_CORES = 24
+    NUM_CORES = 8
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
@@ -18,7 +18,6 @@ def main():
         "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
         "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
         "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
-        "Llama-2-13B-BS32": ["meta-llama/Llama-2-13b-chat-hf", 32, 4096],
     }

     for model_name, model_configuration in model_configurations.items():
diff --git a/benchmark/text-generation/llama2-7b.py b/benchmark/text-generation/llama2-7b.py
index 1eadfe684..754475c45 100644
--- a/benchmark/text-generation/llama2-7b.py
+++ b/benchmark/text-generation/llama2-7b.py
@@ -8,7 +8,7 @@


 def main():
-    NUM_CORES = 24
+    NUM_CORES = 8
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
diff --git a/benchmark/text-generation/llama3-8b.py b/benchmark/text-generation/llama3-8b.py
new file mode 100644
index 000000000..8fc107f2a
--- /dev/null
+++ b/benchmark/text-generation/llama3-8b.py
@@ -0,0 +1,43 @@
+from tempfile import TemporaryDirectory
+
+from transformers import AutoTokenizer
+
+from benchmark import run
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.modeling_decoder import get_available_cores
+
+
+def main():
+    NUM_CORES = 8
+    num_cores = get_available_cores()
+    if num_cores < NUM_CORES:
+        raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
+
+    model_configurations = {
+        "Llama-3-8B-BS1": ["meta-llama/Meta-Llama-3-8B", 1, 4096],
+        "Llama-3-8B-BS4": ["meta-llama/Meta-Llama-3-8B", 4, 4096],
+        "Llama-3-8B-BS8": ["meta-llama/Meta-Llama-3-8B", 8, 4096],
+        "Llama-3-8B-BS16": ["meta-llama/Meta-Llama-3-8B", 16, 4096],
+        "Llama-3-8B-BS32": ["meta-llama/Meta-Llama-3-8B", 32, 4096],
+    }
+
+    for model_name, model_configuration in model_configurations.items():
+        model_id, batch_size, seq_length = model_configuration
+        model = NeuronModelForCausalLM.from_pretrained(
+            model_id,
+            export=True,
+            batch_size=batch_size,
+            sequence_length=seq_length,
+            auto_cast_type="fp16",
+            num_cores=NUM_CORES,
+        )
+        with TemporaryDirectory() as tmpdir:
+            model.save_pretrained(tmpdir)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer.save_pretrained(tmpdir)
+            json_path = f"{model_name}.json"
+            run(tmpdir, 256, 2048, json_path=json_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/latency.png b/docs/assets/benchmarks/inferentia-llama2-13b/latency.png
index 5032ecc2e..9f43062dc 100644
Binary files a/docs/assets/benchmarks/inferentia-llama2-13b/latency.png and b/docs/assets/benchmarks/inferentia-llama2-13b/latency.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png
index 2b0684f95..bfd5fc07b 100644
Binary files a/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png and b/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png
index c51a2f795..2dca870b0 100644
Binary files a/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png and b/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png
index 54427b05e..2f657938b 100644
Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png and b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png
index 0ff02b86a..955ee60de 100644
Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png and b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png
index 52e48ffcf..8844a75e7 100644
Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png and b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png
new file mode 100644
index 000000000..b0af4a39e
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png
new file mode 100644
index 000000000..37deedd97
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png
new file mode 100644
index 000000000..2dd4abe77
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/latency.png b/docs/assets/benchmarks/inferentia-mistral-v2/latency.png
index af760cd7e..a02ce879a 100644
Binary files a/docs/assets/benchmarks/inferentia-mistral-v2/latency.png and b/docs/assets/benchmarks/inferentia-mistral-v2/latency.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png b/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png
index 792eb4d91..ad940c54d 100644
Binary files a/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png and b/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png b/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png
index c7945b88d..6069cc5bc 100644
Binary files a/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png and b/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png differ
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 7f42ff8f6..ca4d1cb73 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -50,6 +50,8 @@
         title: Llama2 13b on AWS Inferentia2
       - local: benchmarks/inferentia-mistral-v2
         title: Mistral v0.2 7b on AWS Inferentia2
+      - local: benchmarks/inferentia-llama3-8b
+        title: Llama-3 8B on AWS Inferentia2
     title: Benchmarks
   - sections:
       - local: community/contributing
diff --git a/docs/source/benchmarks/inferentia-llama2-13b.mdx b/docs/source/benchmarks/inferentia-llama2-13b.mdx
index 2d71efd6a..d268bffb8 100644
--- a/docs/source/benchmarks/inferentia-llama2-13b.mdx
+++ b/docs/source/benchmarks/inferentia-llama2-13b.mdx
@@ -22,13 +22,12 @@
 For this benchmark we will use the following configurations:

 | Model type      | batch_size | sequence_length |
 |-----------------|------------|-----------------|
-| Llama2 13b BS1  | 1          | 4096            |
-| Llama2 13b BS4  | 4          | 4096            |
-| Llama2 13b BS8  | 8          | 4096            |
-| Llama2 13b BS16 | 16         | 4096            |
-| Llama2 13b BS32 | 32         | 4096            |
+| Llama2 13B BS1  | 1          | 4096            |
+| Llama2 13B BS4  | 4          | 4096            |
+| Llama2 13B BS8  | 8          | 4096            |
+| Llama2 13B BS16 | 16         | 4096            |

-*Note: all models are compiled to use all 12 devices corresponding to 24 cores on the `inf2.48xlarge` instance.*
+*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

 *Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*
diff --git a/docs/source/benchmarks/inferentia-llama2-7b.mdx b/docs/source/benchmarks/inferentia-llama2-7b.mdx
index faf978563..6503a9d80 100644
--- a/docs/source/benchmarks/inferentia-llama2-7b.mdx
+++ b/docs/source/benchmarks/inferentia-llama2-7b.mdx
@@ -28,7 +28,7 @@ For this benchmark we will use the following configurations:
 | Llama2 7B BS16  | 16         | 4096            |
 | Llama2 7B BS32  | 32         | 4096            |

-*Note: all models are compiled to use all 12 devices corresponding to 24 cores on the `inf2.48xlarge` instance.*
+*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*

 *Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*
diff --git a/docs/source/benchmarks/inferentia-llama3-8b.mdx b/docs/source/benchmarks/inferentia-llama3-8b.mdx
new file mode 100644
index 000000000..fd471d8ef
--- /dev/null
+++ b/docs/source/benchmarks/inferentia-llama3-8b.mdx
@@ -0,0 +1,61 @@
+
+# Llama-3-8b performance on AWS Inferentia2 (Latency & Throughput)
+
+How fast is Llama-3-8b on Inferentia2? Let's find out!
+
+For this benchmark we will use the following configurations:
+
+| Model type     | batch_size | sequence_length |
+|----------------|------------|-----------------|
+| Llama3 8b BS1  | 1          | 4096            |
+| Llama3 8b BS4  | 4          | 4096            |
+| Llama3 8b BS8  | 8          | 4096            |
+| Llama3 8b BS16 | 16         | 4096            |
+| Llama3 8b BS32 | 32         | 4096            |
+
+*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*
+
+*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*
+
+## Time to first token
+
+The time to first token is the time required to process the input tokens and generate the first output token.
+It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.
+
+We test the time to first token for increasing context sizes, from typical Q/A usage to heavy Retrieval Augmented Generation (RAG) use cases.
+
+Time to first token is expressed in **seconds**.
+
+![Llama3 8b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png "Time to first token")
+
+## Inter-token Latency
+
+The inter-token latency corresponds to the average time elapsed between two generated tokens.
+
+It is expressed in **milliseconds**.
+
+![Llama3 8b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/latency.png "Inter-token latency")
+
+## Throughput
+
+Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number
+by the end-to-end latency.
+
+Throughput is expressed in **tokens/second**.
+
+![Llama3 8b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png "Throughput")
diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py
index a9d8c1dba..25ba069bf 100644
--- a/optimum/exporters/neuron/base.py
+++ b/optimum/exporters/neuron/base.py
@@ -387,7 +387,8 @@ class NeuronDecoderConfig(NeuronConfig):
     be passed to export the model,
     - NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model.
     It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling`
-    - CONTINUOUS_BATCHING (`bool`, , defaults to `False`) -- Whether the model supports continuous batching or not.
+    - CONTINUOUS_BATCHING (`bool`, defaults to `False`) -- Whether the model supports continuous batching or not.
+    - ATTENTION_LAYOUT (`str`, defaults to `HSB`) -- Layout to be used for attention computation.

     The NEURONX_CLASS must always be defined in each model configuration.
@@ -398,6 +399,7 @@
     INPUT_ARGS = ("batch_size", "sequence_length")
     NEURONX_CLASS = None
     CONTINUOUS_BATCHING = False
+    ATTENTION_LAYOUT = "HSB"

     def __init__(self, task: str):
         if not is_transformers_neuronx_available():
@@ -417,3 +419,7 @@ def neuronx_class(self):
     @property
     def continuous_batching(self):
         return self.CONTINUOUS_BATCHING
+
+    @property
+    def attention_layout(self):
+        return self.ATTENTION_LAYOUT
diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py
index d528a7d22..684143839 100644
--- a/optimum/exporters/neuron/model_configs.py
+++ b/optimum/exporters/neuron/model_configs.py
@@ -461,6 +461,7 @@ class GPT2NeuronConfig(TextNeuronDecoderConfig):
 class LLamaNeuronConfig(TextNeuronDecoderConfig):
     NEURONX_CLASS = "llama.model.LlamaForSampling"
     CONTINUOUS_BATCHING = True
+    ATTENTION_LAYOUT = "BSH"


 @register_in_tasks_manager("t5-encoder", "text2text-generation")
diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py
index bd6ddf391..3b37a6f78 100644
--- a/optimum/neuron/modeling_decoder.py
+++ b/optimum/neuron/modeling_decoder.py
@@ -179,11 +179,13 @@ def __init__(
             # Continuous batching is always enabled for models that support it because static batching
             # is broken for these models: see https://github.com/aws-neuron/transformers-neuronx/issues/79
             tnx_kwargs["neuron_config"] = NeuronConfig(
-                continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size)
+                continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size),
+                attention_layout=exporter.attention_layout,
             )
             tnx_kwargs["n_positions"] = [sequence_length]
             tnx_kwargs["context_length_estimate"] = [sequence_length]
         else:
+            tnx_kwargs["neuron_config"] = NeuronConfig(attention_layout=exporter.attention_layout)
             tnx_kwargs["n_positions"] = sequence_length

         # Instantiate neuronx model
diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile
index 4bb513ca8..2637f01d1 100644
--- a/text-generation-inference/server/Makefile
+++ b/text-generation-inference/server/Makefile
@@ -2,7 +2,7 @@ pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
 VERSION ?= 0.0.1
-TGI_VERSION ?= 2.0.1
+TGI_VERSION ?= 2.0.2

 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 mkfile_dir := $(dir $(mkfile_path))
 pkg_dir := $(BUILDDIR)/$(pkg_name)
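
For readers following the `benchmark.py` changes above: each measurement step times a one-token `generate()` call to isolate the context-encoding (prefill) time, then a full call that emits exactly `inc_length` extra tokens, and derives the reported metrics from those two durations. The snippet below only reproduces that arithmetic; the timing values are made-up placeholders, not measured results.

```python
# Worked example of the metrics computed in the benchmark.py hunk above.
# The timings are illustrative placeholders, not measured values.
batch_size = 4
new_tokens = 256      # inc_length: tokens generated after the prompt
encoding_time = 0.35  # seconds spent encoding the prompt (prefill)
duration = 18.0       # seconds for the full generation call

# Inter-token latency: subtract the prefill time, then average per generated token.
latency_ms = (duration - encoding_time) / new_tokens * 1000
# Throughput: only generated tokens are counted, across the whole batch.
throughput = new_tokens * batch_size / duration

print(f"latency: {latency_ms:.1f} ms/token")     # ~68.9 ms/token
print(f"throughput: {throughput:.1f} tokens/s")  # ~56.9 tokens/s
```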
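
To summarize the attention-layout plumbing spread across `base.py`, `model_configs.py` and `modeling_decoder.py`, here is a minimal, self-contained sketch under the assumption that the flow is exactly what the hunks show: a class-level `ATTENTION_LAYOUT` default on the exporter config, a `BSH` override for Llama, and both code paths forwarding the value to the transformers-neuronx `NeuronConfig`. `NeuronConfigStub`, `DecoderConfigSketch`, `LlamaConfigSketch` and `build_neuron_config` are illustrative stand-ins, not names from the repository.

```python
# Minimal sketch of the attention-layout plumbing added by the hunks above.
# NeuronConfigStub stands in for the transformers-neuronx NeuronConfig object
# that modeling_decoder.py actually builds.
from dataclasses import dataclass
from typing import Optional


@dataclass
class NeuronConfigStub:
    attention_layout: str = "HSB"
    continuous_batching: Optional[dict] = None


class DecoderConfigSketch:
    """Mirrors the new class attributes and properties on NeuronDecoderConfig."""

    CONTINUOUS_BATCHING = False
    ATTENTION_LAYOUT = "HSB"  # default layout

    @property
    def continuous_batching(self):
        return self.CONTINUOUS_BATCHING

    @property
    def attention_layout(self):
        return self.ATTENTION_LAYOUT


class LlamaConfigSketch(DecoderConfigSketch):
    """Mirrors LLamaNeuronConfig: continuous batching plus the BSH layout."""

    CONTINUOUS_BATCHING = True
    ATTENTION_LAYOUT = "BSH"


def build_neuron_config(exporter: DecoderConfigSketch, batch_size: int) -> NeuronConfigStub:
    # Same branching as the modeling_decoder.py hunk: both paths now forward
    # the exporter's attention layout.
    if exporter.continuous_batching:
        return NeuronConfigStub(
            attention_layout=exporter.attention_layout,
            continuous_batching={"batch_size_for_shared_caches": batch_size},
        )
    return NeuronConfigStub(attention_layout=exporter.attention_layout)


print(build_neuron_config(LlamaConfigSketch(), batch_size=4).attention_layout)  # BSH
```

Under these assumptions, Llama decoders are now compiled with the `BSH` attention layout while other decoders keep the previous `HSB` default, and the layout is applied whether or not continuous batching is enabled.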