From 9739b8cb8442dfe1b96744e3fa4c0ae64957af8d Mon Sep 17 00:00:00 2001
From: pagezyhf
Date: Wed, 26 Feb 2025 15:53:26 +0100
Subject: [PATCH] add sections and first example of SageMaker inference

---
 docs/source/_toctree.yml                     |  47 ++-
 .../sagemaker/deploy-llama-3-3-70b.mdx       | 202 ++++++++++
 docs/source/training_tutorials/notebooks.mdx |   2 +-
 .../sagemaker/deploy-llama-3-3-70b.ipynb     | 344 ++++++++++++++++++
 4 files changed, 577 insertions(+), 18 deletions(-)
 create mode 100644 docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx
 create mode 100644 notebooks/sagemaker/deploy-llama-3-3-70b.ipynb

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 2c809b01f..a849e851b 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -8,23 +8,36 @@
   - local: containers
     title: Optimum Containers
-  - sections:
-      - local: training_tutorials/notebooks
-        title: Notebooks
-      - local: training_tutorials/fine_tune_bert
-        title: Fine-tune BERT for Text Classification on AWS Trainium
-      - local: training_tutorials/sft_lora_finetune_llm
-        title: Fine-tune Llama 3 8B on with LoRA and the SFTTrainer
-    title: Training Tutorials
-  - sections:
-      - local: inference_tutorials/notebooks
-        title: Notebooks
-      - local: inference_tutorials/llama2-13b-chatbot
-        title: Create your own chatbot with llama-2-13B on AWS Inferentia
-      - local: inference_tutorials/sentence_transformers
-        title: Sentence Transformers on AWS Inferentia
-      - local: inference_tutorials/stable_diffusion
-        title: Generate images with Stable Diffusion models on AWS Inferentia
-    title: Inference Tutorials
+  - sections:
+      - isExpanded: false
+        sections:
+          - local: training_tutorials/notebooks
+            title: Notebooks
+          - local: training_tutorials/fine_tune_bert
+            title: Fine-tune BERT for Text Classification
+          - local: training_tutorials/sft_lora_finetune_llm
+            title: Fine-tune Llama 3 8B with LoRA and the SFTTrainer
+        title: EC2
+    title: Training Tutorials
+  - sections:
+      - isExpanded: false
+        sections:
+          - local: inference_tutorials/notebooks
+            title: Notebooks
+          - local: inference_tutorials/llama2-13b-chatbot
+            title: Create your own chatbot with llama-2-13B on AWS Inferentia
+          - local: inference_tutorials/sentence_transformers
+            title: Sentence Transformers on AWS Inferentia
+          - local: inference_tutorials/stable_diffusion
+            title: Generate images with Stable Diffusion models on AWS Inferentia
+        title: EC2
+      - isExpanded: false
+        sections:
+          - local: inference_tutorials/sagemaker/deploy-llama-3-3-70b
+            title: Deploy Llama 3.3 70B on AWS Inferentia2 with SageMaker
+        title: SageMaker
+    title: Inference Tutorials
+    title: Tutorials
 - sections:
   - local: guides/setup_aws_instance
     title: Set up AWS Trainium instance
diff --git a/docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx b/docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx
new file mode 100644
index 000000000..46f49fe2b
--- /dev/null
+++ b/docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx
@@ -0,0 +1,202 @@

# Deploy Llama 3.3 70B on AWS Inferentia2

_There is a notebook version of this tutorial [here](https://github.com/huggingface/optimum-neuron/blob/main/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb)._

In this tutorial you will learn how to deploy the [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model on AWS Inferentia2 with Hugging Face Optimum on Amazon SageMaker.
We are going to use the Hugging Face TGI Neuron Container, a purpose-built inference container that makes it easy to deploy LLMs on AWS Inferentia2, powered by [Text Generation Inference](https://huggingface.co/docs/text-generation-inference/index) and [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index).

We will cover how to:
1. [Setup development environment](#1-setup-development-environment)
2. [Retrieve the latest Hugging Face TGI Neuron DLC](#2-retrieve-the-latest-hugging-face-tgi-neuron-dlc)
3. [Deploy Llama 3.3 70B to Inferentia2](#3-deploy-llama-33-70b-to-inferentia2)
4. [Clean up](#4-clean-up)

Let's get started! 🚀

[AWS Inferentia (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) are purpose-built EC2 instances for deep learning (DL) inference workloads. Here are the different instances of the Inferentia2 family.

| instance size | accelerators | Neuron Cores | accelerator memory (GB) | vCPU | CPU memory (GB) | on-demand price ($/h) |
| ------------- | ------------ | ------------ | ----------------------- | ---- | --------------- | --------------------- |
| inf2.xlarge   | 1            | 2            | 32                      | 4    | 16              | 0.76                  |
| inf2.8xlarge  | 1            | 2            | 32                      | 32   | 128             | 1.97                  |
| inf2.24xlarge | 6            | 12           | 192                     | 96   | 384             | 6.49                  |
| inf2.48xlarge | 12           | 24           | 384                     | 192  | 768             | 12.98                 |

## 1. Setup development environment

For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) kernel and the `sagemaker` Python SDK to deploy Llama 3.3 70B to a SageMaker inference endpoint.

Make sure you have the latest version of the SageMaker SDK installed.

```bash
!pip install sagemaker --upgrade --quiet
```

Then, instantiate the sagemaker role and session.

```python
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")
```

## 2. Retrieve the latest Hugging Face TGI Neuron DLC

The latest Hugging Face TGI Neuron DLCs can be used to run inference on AWS Inferentia2. You can use the `get_huggingface_llm_image_uri` method of the `sagemaker` SDK to retrieve the appropriate Hugging Face TGI Neuron DLC URI based on your desired `backend`, `session`, `region`, and `version`. If the latest version of the container is not yet available through the SageMaker SDK, you can find it [here](https://huggingface.co/docs/optimum-neuron/containers).

```python
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface-neuronx",
)

print(f"llm image uri: {llm_image}")
```
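By default, `get_huggingface_llm_image_uri` returns the newest TGI Neuron image known to your installed `sagemaker` SDK. If the release you want is not wired into the SDK yet, you can pin it explicitly through the `version` argument mentioned above. A minimal sketch, where the version string is a placeholder to replace with a release listed on the containers page:

```python
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Pin an explicit TGI Neuron container version instead of relying on the SDK default.
# "<container-version>" is a placeholder; use a release listed on the Optimum Neuron containers page.
llm_image = get_huggingface_llm_image_uri(
  "huggingface-neuronx",
  version="<container-version>",
)
```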
## 3. Deploy Llama 3.3 70B to Inferentia2

At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes), which means that we need to specify our sequence length and batch size ahead of time.
To make it easier for customers to utilize the full power of Inferentia2, we created a [neuron model cache](https://huggingface.co/docs/optimum-neuron/guides/cache_system), which contains pre-compiled configurations for the most popular LLMs, including Llama 3.3 70B.

This means we don't need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find the compiled/cached configurations on the [Hugging Face Hub](https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config). If your desired configuration is not yet cached, you can compile it yourself using the [Optimum CLI](https://huggingface.co/docs/optimum-neuron/guides/export_model) (a sketch of the export command is shown at the end of this section) or open a request at the [Cache repository](https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions).

**Deploying Llama 3.3 70B to a SageMaker Endpoint**

Before deploying the model to Amazon SageMaker, we must define the TGI Neuron endpoint configuration. We need to make sure the following additional parameters are defined:

- `HF_NUM_CORES`: Number of Neuron Cores used for the compilation.
- `HF_BATCH_SIZE`: The batch size that was used to compile the model.
- `HF_SEQUENCE_LENGTH`: The sequence length that was used to compile the model.
- `HF_AUTO_CAST_TYPE`: The auto cast type that was used to compile the model.

We also need to define the traditional TGI parameters:

- `HF_MODEL_ID`: The Hugging Face model ID.
- `HF_TOKEN`: The Hugging Face API token to access gated models.
- `MAX_BATCH_SIZE`: The maximum batch size that the model can handle, equal to the batch size used for compilation.
- `MAX_INPUT_TOKENS`: The maximum number of input tokens the model can handle.
- `MAX_TOTAL_TOKENS`: The maximum total number of tokens (input and generated), equal to the sequence length used for compilation.

Optionally, you can configure the endpoint to support chat templates:

- `MESSAGES_API_ENABLED`: Enable the Messages API

**Select the right instance type**

Llama 3.3 70B is a large model and requires a lot of memory. We are going to use the `inf2.48xlarge` instance type, which has 192 vCPUs and 384 GB of accelerator memory. The `inf2.48xlarge` instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. You can find the cached configurations for Llama 3.3 70B [here](https://huggingface.co/aws-neuron/optimum-neuron-cache/blob/main/inference-cache-config/llama3-70b.json#L16). In our case we will use a batch size of 4 and a sequence length of 4096.

Before we can deploy Llama 3.3 70B to Inferentia2, we need to make sure we have the necessary permissions to access the model. You can request access to the model [here](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) and create a User access token following this [guide](https://huggingface.co/docs/hub/en/security-tokens).

After that, we can create our endpoint configuration and deploy the model to Amazon SageMaker.
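As mentioned above, if your desired combination of batch size, sequence length, number of cores and dtype were not in the neuron model cache, you would have to compile the model yourself before deploying it. The sketch below shows what that export could look like with the Optimum CLI; treat the exact flag values and the output directory as assumptions to adapt to your own setup:

```bash
# Hypothetical pre-compilation step, only needed if the configuration is not cached.
optimum-cli export neuron \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --batch_size 4 \
  --sequence_length 4096 \
  --num_cores 24 \
  --auto_cast_type bf16 \
  ./llama-3-3-70b-neuron/
```

Since the configuration used in this tutorial is already cached, we skip this step and use the endpoint configuration below as-is.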
```python
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.inf2.48xlarge"
health_check_timeout = 2400 # additional time to load the model
volume_size = 512 # size in GB of the EBS volume

# Define Model and Endpoint configuration parameter
config = {
  "HF_MODEL_ID": "meta-llama/Llama-3.3-70B-Instruct",
  "HF_NUM_CORES": "24", # number of neuron cores
  "HF_AUTO_CAST_TYPE": "bf16", # dtype of the model
  "MAX_BATCH_SIZE": "4", # max batch size for the model
  "MAX_INPUT_TOKENS": "4000", # max length of input text
  "MAX_TOTAL_TOKENS": "4096", # max length of input + generated text
  "MESSAGES_API_ENABLED": "true", # Enable the messages API
  "HF_TOKEN": "<REPLACE WITH YOUR TOKEN>",
}

assert config["HF_TOKEN"] != "<REPLACE WITH YOUR TOKEN>", "Please replace '<REPLACE WITH YOUR TOKEN>' with your Hugging Face Hub API token"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)
```

After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the `deploy` method. We will deploy the model with the `ml.inf2.48xlarge` instance type. TGI will automatically distribute and shard the model across all Inferentia devices.

```python
# deactivate warning since model is compiled
llm_model._is_compiled_model = True

llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  volume_size=volume_size
)
```

SageMaker will now create our endpoint and deploy the model to it. This takes around 30-40 minutes; we are working on improving the deployment time.

After our endpoint is deployed, we can run inference on it. We will use the `predict` method of the `predictor` to run inference on our endpoint. We can pass different parameters to influence the generation; they are defined in the `parameters` attribute of the payload. You can find the supported parameters [here](https://huggingface.co/docs/text-generation-inference/messages_api).

The Messages API allows us to interact with the model in a conversational way. We can define the role of a message and its content. The role can be either `system`, `assistant` or `user`. The `system` role is used to provide context to the model and the `user` role is used to ask questions or provide input to the model.

```python
# Prompt to generate
messages = [
  { "role": "system", "content": "You are a helpful assistant." },
  { "role": "user", "content": "What is deep learning?" }
]

# Generation arguments
parameters = {
  "top_p": 0.6,
  "temperature": 0.9,
  "max_tokens": 50,
  "stop": ["<|eot_id|>"],
}

chat = llm.predict({"messages": messages, **parameters})

print(chat["choices"][0]["message"]["content"].strip())
```
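The `predictor` is only a convenience wrapper. Once the endpoint is up, any client with AWS credentials can call it through the SageMaker runtime API with the same Messages API payload. A minimal sketch, assuming the endpoint name exposed by the predictor created above:

```python
import json
import boto3

# Call the deployed endpoint through the SageMaker runtime API
# (equivalent to llm.predict, usable from any client with AWS credentials).
smr_client = boto3.client("sagemaker-runtime")

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is deep learning?"},
    ],
    "max_tokens": 50,
    "temperature": 0.9,
    "top_p": 0.6,
    "stop": ["<|eot_id|>"],
}

response = smr_client.invoke_endpoint(
    EndpointName=llm.endpoint_name,  # name of the endpoint deployed above
    ContentType="application/json",
    Body=json.dumps(payload),
)

result = json.loads(response["Body"].read())
print(result["choices"][0]["message"]["content"].strip())
```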
## 4. Clean up

To clean up, we can delete the model and the endpoint.

```python
llm.delete_model()
llm.delete_endpoint()
```
\ No newline at end of file
diff --git a/docs/source/training_tutorials/notebooks.mdx b/docs/source/training_tutorials/notebooks.mdx
index 2916d1c3f..1bc9dad2f 100644
--- a/docs/source/training_tutorials/notebooks.mdx
+++ b/docs/source/training_tutorials/notebooks.mdx
@@ -20,5 +20,5 @@ We prepared some notebooks for you, so that you can run directly tutorials in th
 | Notebook | Description | Studio Lab |
 |:---|:---|:---:|
-| [Fine-tune BERT for text classification on AWS Trainium](https://github.com/huggingface/optimum-neuron/blob/main/notebooks/text-classification/notebook.ipynb) | Show how to fine-tune BERT on AWS Trainium for text classification. | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-neuron/blob/main/notebooks/text-classification/notebook.ipynb) |
+| [Fine-tune BERT for text classification on AWS Trainium](https://github.com/huggingface/optimum-neuron/blob/main/notebooks/text-classification/fine_tune_bert.ipynb) | Show how to fine-tune BERT on AWS Trainium for text classification. | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-neuron/blob/main/notebooks/text-classification/fine_tune_bert.ipynb) |
diff --git a/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb b/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb
new file mode 100644
index 000000000..add702008
--- /dev/null
+++ b/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb
@@ -0,0 +1,344 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deploy Llama 3.3 70B on AWS Inferentia2\n",
    "\n",
    "In this tutorial you will learn how to deploy the [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model on AWS Inferentia2 with Hugging Face Optimum on Amazon SageMaker. We are going to use the Hugging Face TGI Neuron Container, a purpose-built inference container that makes it easy to deploy LLMs on AWS Inferentia2, powered by [Text Generation Inference](https://huggingface.co/docs/text-generation-inference/index) and [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index).\n",
    "\n",
    "\n",
    "We will cover how to:\n",
    "1. [Setup development environment](#1-setup-development-environment)\n",
    "2. [Retrieve the latest Hugging Face TGI Neuron DLC](#2-retrieve-the-latest-hugging-face-tgi-neuron-dlc)\n",
    "3. [Deploy Llama 3.3 70B to Inferentia2](#3-deploy-llama-33-70b-to-inferentia2)\n",
    "4. [Clean up](#4-clean-up)\n",
    "\n",
    "Let's get started! 🚀\n",
    "\n",
    "[AWS Inferentia (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) are purpose-built EC2 instances for deep learning (DL) inference workloads. Here are the different instances of the Inferentia2 family.\n",
    "\n",
    "| instance size | accelerators | Neuron Cores | accelerator memory (GB) | vCPU | CPU memory (GB) | on-demand price ($/h) |\n",
    "| ------------- | ------------ | ------------ | ----------------------- | ---- | --------------- | --------------------- |\n",
    "| inf2.xlarge | 1 | 2 | 32 | 4 | 16 | 0.76 |\n",
    "| inf2.8xlarge | 1 | 2 | 32 | 32 | 128 | 1.97 |\n",
    "| inf2.24xlarge | 6 | 12 | 192 | 96 | 384 | 6.49 |\n",
    "| inf2.48xlarge | 12 | 24 | 384 | 192 | 768 | 12.98 |\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup development environment\n",
    "\n",
    "For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) kernel and the `sagemaker` Python SDK to deploy Llama 3.3 70B to a SageMaker inference endpoint.\n",
    "\n",
    "Make sure you have the latest version of the SageMaker SDK installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install sagemaker --upgrade --quiet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, instantiate the sagemaker role and session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import boto3\n",
    "sess = sagemaker.Session()\n",
    "# sagemaker session bucket -> used for uploading data, models and logs\n",
    "# sagemaker will automatically create this bucket if it does not exist\n",
    "sagemaker_session_bucket=None\n",
    "if sagemaker_session_bucket is None and sess is not None:\n",
    "    # set to default bucket if a bucket name is not given\n",
    "    sagemaker_session_bucket = sess.default_bucket()\n",
    "\n",
    "try:\n",
    "    role = sagemaker.get_execution_role()\n",
    "except ValueError:\n",
    "    iam = boto3.client('iam')\n",
    "    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n",
    "\n",
    "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n",
    "\n",
    "print(f\"sagemaker role arn: {role}\")\n",
    "print(f\"sagemaker session region: {sess.boto_region_name}\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Retrieve the latest Hugging Face TGI Neuron DLC\n",
    "\n",
    "The latest Hugging Face TGI Neuron DLCs can be used to run inference on AWS Inferentia2. You can use the `get_huggingface_llm_image_uri` method of the `sagemaker` SDK to retrieve the appropriate Hugging Face TGI Neuron DLC URI based on your desired `backend`, `session`, `region`, and `version`. If the latest version of the container is not yet available through the SageMaker SDK, you can find it [here](https://huggingface.co/docs/optimum-neuron/containers)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "from sagemaker.huggingface import get_huggingface_llm_image_uri\n",
    "\n",
    "# retrieve the llm image uri\n",
    "llm_image = get_huggingface_llm_image_uri(\n",
    "  \"huggingface-neuronx\",\n",
    ")\n",
    "\n",
    "print(f\"llm image uri: {llm_image}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Deploy Llama 3.3 70B to Inferentia2\n",
    "\n",
    "At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes), which means that we need to specify our sequence length and batch size ahead of time.\n",
    "To make it easier for customers to utilize the full power of Inferentia2, we created a [neuron model cache](https://huggingface.co/docs/optimum-neuron/guides/cache_system), which contains pre-compiled configurations for the most popular LLMs, including Llama 3.3 70B. \n",
    "\n",
    "This means we don't need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find the compiled/cached configurations on the [Hugging Face Hub](https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config). If your desired configuration is not yet cached, you can compile it yourself using the [Optimum CLI](https://huggingface.co/docs/optimum-neuron/guides/export_model) or open a request at the [Cache repository](https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions).\n",
    "\n",
    "**Deploying Llama 3.3 70B to a SageMaker Endpoint** \n",
    "\n",
    "Before deploying the model to Amazon SageMaker, we must define the TGI Neuron endpoint configuration. We need to make sure the following additional parameters are defined: \n",
    "\n",
    "- `HF_NUM_CORES`: Number of Neuron Cores used for the compilation.\n",
    "- `HF_BATCH_SIZE`: The batch size that was used to compile the model.\n",
    "- `HF_SEQUENCE_LENGTH`: The sequence length that was used to compile the model.\n",
    "- `HF_AUTO_CAST_TYPE`: The auto cast type that was used to compile the model.\n",
    "\n",
    "We also need to define the traditional TGI parameters:\n",
    "\n",
    "- `HF_MODEL_ID`: The Hugging Face model ID.\n",
    "- `HF_TOKEN`: The Hugging Face API token to access gated models.\n",
    "- `MAX_BATCH_SIZE`: The maximum batch size that the model can handle, equal to the batch size used for compilation.\n",
    "- `MAX_INPUT_TOKENS`: The maximum number of input tokens the model can handle. \n",
    "- `MAX_TOTAL_TOKENS`: The maximum total number of tokens (input and generated), equal to the sequence length used for compilation.\n",
    "\n",
    "Optionally, you can configure the endpoint to support chat templates:\n",
    "- `MESSAGES_API_ENABLED`: Enable the Messages API \n",
    "\n",
    "**Select the right instance type**\n",
    "\n",
    "Llama 3.3 70B is a large model and requires a lot of memory. We are going to use the `inf2.48xlarge` instance type, which has 192 vCPUs and 384 GB of accelerator memory. The `inf2.48xlarge` instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. You can find the cached configurations for Llama 3.3 70B [here](https://huggingface.co/aws-neuron/optimum-neuron-cache/blob/main/inference-cache-config/llama3-70b.json#L16). In our case we will use a batch size of 4 and a sequence length of 4096. \n",
    "\n",
    "\n",
    "Before we can deploy Llama 3.3 70B to Inferentia2, we need to make sure we have the necessary permissions to access the model. You can request access to the model [here](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) and create a User access token following this [guide](https://huggingface.co/docs/hub/en/security-tokens).\n",
    "\n",
    "\n",
    "After that we can create our endpoint configuration and deploy the model to Amazon SageMaker."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# sagemaker config\n", + "instance_type = \"ml.inf2.48xlarge\"\n", + "health_check_timeout=2400 # additional time to load the model\n", + "volume_size=512 # size in GB of the EBS volume\n", + "\n", + "# Define Model and Endpoint configuration parameter\n", + "config = {\n", + " \"HF_MODEL_ID\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n", + " \"HF_NUM_CORES\": \"24\", # number of neuron cores\n", + " \"HF_AUTO_CAST_TYPE\": \"bf16\", # dtype of the model\n", + " \"MAX_BATCH_SIZE\": \"4\", # max batch size for the model\n", + " \"MAX_INPUT_TOKENS\": \"4000\", # max length of input text\n", + " \"MAX_TOTAL_TOKENS\": \"4096\", # max length of generated text\n", + " \"MESSAGES_API_ENABLED\": \"true\", # Enable the messages API\n", + " \"HF_TOKEN\": \"\",\n", + "}\n", + "\n", + "assert config[\"HF_TOKEN\"] != \"\", \"Please replace '' with your Hugging Face Hub API token\"\n", + "\n", + "\n", + "# create HuggingFaceModel with the image uri\n", + "llm_model = HuggingFaceModel(\n", + " role=role,\n", + " image_uri=llm_image,\n", + " env=config\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the `deploy` method. We will deploy the model with the `ml.inf2.48xlarge` instance type. TGI will automatically distribute and shard the model across all Inferentia devices." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "# deactivate warning since model is compiled\n", + "llm_model._is_compiled_model = True\n", + "\n", + "llm = llm_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_type,\n", + " container_startup_health_check_timeout=health_check_timeout,\n", + " volume_size=volume_size\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SageMaker will now create our endpoint and deploy the model to it. It takes around 30-40 minutes, we are working on improving the deployment time." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After our endpoint is deployed we can run inference on it. We will use the `predict` method from the `predictor` to run inference on our endpoint. We can inference with different parameters to impact the generation. Parameters can be defined as in the `parameters` attribute of the payload. You can find supported parameters in the [here](https://huggingface.co/docs/text-generation-inference/messages_api).\n", + "\n", + "The Messages API allows us to interact with the model in a conversational way. We can define the role of the message and the content. The role can be either `system`,`assistant` or `user`. 
    "\n",
    "```json\n",
    "{\n",
    "  \"messages\": [\n",
    "    { \"role\": \"system\", \"content\": \"You are a helpful assistant.\" },\n",
    "    { \"role\": \"user\", \"content\": \"What is deep learning?\" }\n",
    "  ]\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Prompt to generate\n",
    "messages = [\n",
    "  { \"role\": \"system\", \"content\": \"You are a helpful assistant.\" },\n",
    "  { \"role\": \"user\", \"content\": \"What is deep learning?\" }\n",
    "]\n",
    "\n",
    "# Generation arguments\n",
    "parameters = {\n",
    "  \"model\": \"meta-llama/Llama-3.3-70B-Instruct\", # placeholder, needed\n",
    "  \"top_p\": 0.6,\n",
    "  \"temperature\": 0.9,\n",
    "  \"max_tokens\": 50,\n",
    "  \"stop\": [\"<|eot_id|>\"],\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Okay, let's test it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "chat = llm.predict({\"messages\": messages, **parameters})\n",
    "\n",
    "print(chat[\"choices\"][0][\"message\"][\"content\"].strip())"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Clean up\n",
    "\n",
    "To clean up, we can delete the model and endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm.delete_model()\n",
    "llm.delete_endpoint()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "5fcf248a74081676ead7e77f54b2c239ba2921b952f7cbcdbbe5427323165924"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}