From 9739b8cb8442dfe1b96744e3fa4c0ae64957af8d Mon Sep 17 00:00:00 2001
From: pagezyhf
Date: Wed, 26 Feb 2025 15:53:26 +0100
Subject: [PATCH] add sections and first example of SageMaker inference

---
 docs/source/_toctree.yml                     |  47 ++-
 .../sagemaker/deploy-llama-3-3-70b.mdx       | 202 ++++++++++
 docs/source/training_tutorials/notebooks.mdx |   2 +-
 .../sagemaker/deploy-llama-3-3-70b.ipynb     | 344 ++++++++++++++++++
 4 files changed, 577 insertions(+), 18 deletions(-)
 create mode 100644 docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx
 create mode 100644 notebooks/sagemaker/deploy-llama-3-3-70b.ipynb

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 2c809b01f..a849e851b 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -8,23 +8,36 @@
   - local: containers
     title: Optimum Containers
-  - sections:
-      - local: training_tutorials/notebooks
-        title: Notebooks
-      - local: training_tutorials/fine_tune_bert
-        title: Fine-tune BERT for Text Classification on AWS Trainium
-      - local: training_tutorials/sft_lora_finetune_llm
-        title: Fine-tune Llama 3 8B on with LoRA and the SFTTrainer
-    title: Training Tutorials
-  - sections:
-      - local: inference_tutorials/notebooks
-        title: Notebooks
-      - local: inference_tutorials/llama2-13b-chatbot
-        title: Create your own chatbot with llama-2-13B on AWS Inferentia
-      - local: inference_tutorials/sentence_transformers
-        title: Sentence Transformers on AWS Inferentia
-      - local: inference_tutorials/stable_diffusion
-        title: Generate images with Stable Diffusion models on AWS Inferentia
-    title: Inference Tutorials
+  - sections:
+      - isExpanded: false
+        sections:
+          - local: training_tutorials/notebooks
+            title: Notebooks
+          - local: training_tutorials/fine_tune_bert
+            title: Fine-tune BERT for Text Classification
+          - local: training_tutorials/sft_lora_finetune_llm
+            title: Fine-tune Llama 3 8B with LoRA and the SFTTrainer
+        title: EC2
+    title: Training Tutorials
+  - sections:
+      - isExpanded: false
+        sections:
+          - local: inference_tutorials/notebooks
+            title: Notebooks
+          - local: inference_tutorials/llama2-13b-chatbot
+            title: Create your own chatbot with llama-2-13B on AWS Inferentia
+          - local: inference_tutorials/sentence_transformers
+            title: Sentence Transformers on AWS Inferentia
+          - local: inference_tutorials/stable_diffusion
+            title: Generate images with Stable Diffusion models on AWS Inferentia
+        title: EC2
+      - isExpanded: false
+        sections:
+          - local: inference_tutorials/sagemaker/deploy-llama-3-3-70b
+            title: Deploy Llama 3.3 70B on AWS Inferentia2 with SageMaker
+        title: SageMaker
+    title: Inference Tutorials
+    title: Tutorials
 - sections:
   - local: guides/setup_aws_instance
     title: Set up AWS Trainium instance
diff --git a/docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx b/docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx
new file mode 100644
index 000000000..46f49fe2b
--- /dev/null
+++ b/docs/source/inference_tutorials/sagemaker/deploy-llama-3-3-70b.mdx
@@ -0,0 +1,202 @@

# Deploy Llama 3.3 70B on AWS Inferentia2

_There is a notebook version of this tutorial [here](https://github.com/huggingface/optimum-neuron/blob/main/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb)._

In this tutorial you will learn how to deploy the [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model on AWS Inferentia2 with Hugging Face Optimum on Amazon SageMaker.
We are going to use the Hugging Face TGI Neuron Container, a purpose-built inference container that makes it easy to deploy LLMs on AWS Inferentia2, powered by [Text Generation Inference](https://huggingface.co/docs/text-generation-inference/index) and [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index).

We will cover how to:
1. [Setup development environment](#1-setup-development-environment)
2. [Retrieve the latest Hugging Face TGI Neuron DLC](#2-retrieve-the-latest-hugging-face-tgi-neuron-dlc)
3. [Deploy Llama 3.3 70B to Inferentia2](#3-deploy-llama-33-70b-to-inferentia2)
4. [Clean up](#4-clean-up)

Let's get started! 🚀

[AWS Inferentia (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) are purpose-built EC2 instances for deep learning (DL) inference workloads. Here are the different instances of the Inferentia2 family.

| instance size | accelerators | Neuron Cores | accelerator memory (GB) | vCPU | CPU memory (GB) | on-demand price ($/h) |
| ------------- | ------------ | ------------ | ----------------------- | ---- | --------------- | --------------------- |
| inf2.xlarge   | 1            | 2            | 32                      | 4    | 16              | 0.76                  |
| inf2.8xlarge  | 1            | 2            | 32                      | 32   | 128             | 1.97                  |
| inf2.24xlarge | 6            | 12           | 192                     | 96   | 384             | 6.49                  |
| inf2.48xlarge | 12           | 24           | 384                     | 192  | 768             | 12.98                 |

## 1. Setup development environment

For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) kernel and the `sagemaker` Python SDK to deploy Llama 3.3 70B to a SageMaker inference endpoint.

Make sure you have the latest version of the SageMaker SDK installed.

```bash
!pip install sagemaker --upgrade --quiet
```

Then, instantiate the sagemaker role and session.

```python
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")
```

## 2. Retrieve the latest Hugging Face TGI Neuron DLC

The latest Hugging Face TGI Neuron DLCs can be used to run inference on AWS Inferentia2. You can use the `get_huggingface_llm_image_uri` method of the `sagemaker` SDK to retrieve the appropriate Hugging Face TGI Neuron DLC URI based on your desired `backend`, `session`, `region`, and `version`. If the latest version of the container is not yet available through the SageMaker SDK, you can find it [here](https://huggingface.co/docs/optimum-neuron/containers).

```python
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface-neuronx",
)

print(f"llm image uri: {llm_image}")
```
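By default, `get_huggingface_llm_image_uri` returns the newest TGI Neuron image known to your installed `sagemaker` SDK. If the release you want is not wired into the SDK yet, you can pin it explicitly through the `version` argument mentioned above. A minimal sketch, where the version string is a placeholder to replace with a release listed on the containers page:

```python
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Pin an explicit TGI Neuron container version instead of relying on the SDK default.
# "<container-version>" is a placeholder; use a release listed on the Optimum Neuron containers page.
llm_image = get_huggingface_llm_image_uri(
  "huggingface-neuronx",
  version="<container-version>",
)
```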
## 3. Deploy Llama 3.3 70B to Inferentia2

At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes), which means that we need to specify our sequence length and batch size ahead of time.
To make it easier for customers to utilize the full power of Inferentia2, we created a [neuron model cache](https://huggingface.co/docs/optimum-neuron/guides/cache_system), which contains pre-compiled configurations for the most popular LLMs, including Llama 3.3 70B.

This means we don't need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find the compiled/cached configurations on the [Hugging Face Hub](https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config). If your desired configuration is not yet cached, you can compile it yourself using the [Optimum CLI](https://huggingface.co/docs/optimum-neuron/guides/export_model) (a sketch of the export command is shown at the end of this section) or open a request at the [Cache repository](https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions).

**Deploying Llama 3.3 70B to a SageMaker Endpoint**

Before deploying the model to Amazon SageMaker, we must define the TGI Neuron endpoint configuration. We need to make sure the following additional parameters are defined:

- `HF_NUM_CORES`: Number of Neuron Cores used for the compilation.
- `HF_BATCH_SIZE`: The batch size that was used to compile the model.
- `HF_SEQUENCE_LENGTH`: The sequence length that was used to compile the model.
- `HF_AUTO_CAST_TYPE`: The auto cast type that was used to compile the model.

We also need to define the traditional TGI parameters:

- `HF_MODEL_ID`: The Hugging Face model ID.
- `HF_TOKEN`: The Hugging Face API token to access gated models.
- `MAX_BATCH_SIZE`: The maximum batch size that the model can handle, equal to the batch size used for compilation.
- `MAX_INPUT_TOKENS`: The maximum number of input tokens the model can handle.
- `MAX_TOTAL_TOKENS`: The maximum total number of tokens (input and generated), equal to the sequence length used for compilation.

Optionally, you can configure the endpoint to support chat templates:

- `MESSAGES_API_ENABLED`: Enable the Messages API

**Select the right instance type**

Llama 3.3 70B is a large model and requires a lot of memory. We are going to use the `inf2.48xlarge` instance type, which has 192 vCPUs and 384 GB of accelerator memory. The `inf2.48xlarge` instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. You can find the cached configurations for Llama 3.3 70B [here](https://huggingface.co/aws-neuron/optimum-neuron-cache/blob/main/inference-cache-config/llama3-70b.json#L16). In our case we will use a batch size of 4 and a sequence length of 4096.

Before we can deploy Llama 3.3 70B to Inferentia2, we need to make sure we have the necessary permissions to access the model. You can request access to the model [here](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) and create a User access token following this [guide](https://huggingface.co/docs/hub/en/security-tokens).

After that, we can create our endpoint configuration and deploy the model to Amazon SageMaker.
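As mentioned above, if your desired combination of batch size, sequence length, number of cores and dtype were not in the neuron model cache, you would have to compile the model yourself before deploying it. The sketch below shows what that export could look like with the Optimum CLI; treat the exact flag values and the output directory as assumptions to adapt to your own setup:

```bash
# Hypothetical pre-compilation step, only needed if the configuration is not cached.
optimum-cli export neuron \
  --model meta-llama/Llama-3.3-70B-Instruct \
  --batch_size 4 \
  --sequence_length 4096 \
  --num_cores 24 \
  --auto_cast_type bf16 \
  ./llama-3-3-70b-neuron/
```

Since the configuration used in this tutorial is already cached, we skip this step and use the endpoint configuration below as-is.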
```python
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.inf2.48xlarge"
health_check_timeout = 2400 # additional time to load the model
volume_size = 512 # size in GB of the EBS volume

# Define Model and Endpoint configuration parameter
config = {
  "HF_MODEL_ID": "meta-llama/Llama-3.3-70B-Instruct",
  "HF_NUM_CORES": "24", # number of neuron cores
  "HF_AUTO_CAST_TYPE": "bf16", # dtype of the model
  "MAX_BATCH_SIZE": "4", # max batch size for the model
  "MAX_INPUT_TOKENS": "4000", # max length of input text
  "MAX_TOTAL_TOKENS": "4096", # max length of input + generated text
  "MESSAGES_API_ENABLED": "true", # Enable the messages API
  "HF_TOKEN": "<REPLACE WITH YOUR TOKEN>",
}

assert config["HF_TOKEN"] != "<REPLACE WITH YOUR TOKEN>", "Please replace '<REPLACE WITH YOUR TOKEN>' with your Hugging Face Hub API token"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)
```

After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the `deploy` method. We will deploy the model with the `ml.inf2.48xlarge` instance type. TGI will automatically distribute and shard the model across all Inferentia devices.

```python
# deactivate warning since model is compiled
llm_model._is_compiled_model = True

llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout,
  volume_size=volume_size
)
```

SageMaker will now create our endpoint and deploy the model to it. This takes around 30-40 minutes; we are working on improving the deployment time.

After our endpoint is deployed, we can run inference on it. We will use the `predict` method of the `predictor` to run inference on our endpoint. We can pass different parameters to influence the generation; they are defined in the `parameters` attribute of the payload. You can find the supported parameters [here](https://huggingface.co/docs/text-generation-inference/messages_api).

The Messages API allows us to interact with the model in a conversational way. We can define the role of a message and its content. The role can be either `system`, `assistant` or `user`. The `system` role is used to provide context to the model and the `user` role is used to ask questions or provide input to the model.

```python
# Prompt to generate
messages = [
  { "role": "system", "content": "You are a helpful assistant." },
  { "role": "user", "content": "What is deep learning?" }
]

# Generation arguments
parameters = {
  "top_p": 0.6,
  "temperature": 0.9,
  "max_tokens": 50,
  "stop": ["<|eot_id|>"],
}

chat = llm.predict({"messages": messages, **parameters})

print(chat["choices"][0]["message"]["content"].strip())
```
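The `predictor` is only a convenience wrapper. Once the endpoint is up, any client with AWS credentials can call it through the SageMaker runtime API with the same Messages API payload. A minimal sketch, assuming the endpoint name exposed by the predictor created above:

```python
import json
import boto3

# Call the deployed endpoint through the SageMaker runtime API
# (equivalent to llm.predict, usable from any client with AWS credentials).
smr_client = boto3.client("sagemaker-runtime")

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is deep learning?"},
    ],
    "max_tokens": 50,
    "temperature": 0.9,
    "top_p": 0.6,
    "stop": ["<|eot_id|>"],
}

response = smr_client.invoke_endpoint(
    EndpointName=llm.endpoint_name,  # name of the endpoint deployed above
    ContentType="application/json",
    Body=json.dumps(payload),
)

result = json.loads(response["Body"].read())
print(result["choices"][0]["message"]["content"].strip())
```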
## 4. Clean up

To clean up, we can delete the model and the endpoint.

```python
llm.delete_model()
llm.delete_endpoint()
```
\ No newline at end of file
diff --git a/docs/source/training_tutorials/notebooks.mdx b/docs/source/training_tutorials/notebooks.mdx
index 2916d1c3f..1bc9dad2f 100644
--- a/docs/source/training_tutorials/notebooks.mdx
+++ b/docs/source/training_tutorials/notebooks.mdx
@@ -20,5 +20,5 @@ We prepared some notebooks for you, so that you can run directly tutorials in th
 | Notebook | Description | Studio Lab |
 |:---|:---|:---:|
-| [Fine-tune BERT for text classification on AWS Trainium](https://github.com/huggingface/optimum-neuron/blob/main/notebooks/text-classification/notebook.ipynb) | Show how to fine-tune BERT on AWS Trainium for text classification. | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-neuron/blob/main/notebooks/text-classification/notebook.ipynb) |
+| [Fine-tune BERT for text classification on AWS Trainium](https://github.com/huggingface/optimum-neuron/blob/main/notebooks/text-classification/fine_tune_bert.ipynb) | Show how to fine-tune BERT on AWS Trainium for text classification. | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-neuron/blob/main/notebooks/text-classification/fine_tune_bert.ipynb) |
diff --git a/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb b/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb
new file mode 100644
index 000000000..add702008
--- /dev/null
+++ b/notebooks/sagemaker/deploy-llama-3-3-70b.ipynb
@@ -0,0 +1,344 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deploy Llama 3.3 70B on AWS Inferentia2\n",
    "\n",
    "In this tutorial you will learn how to deploy the [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) model on AWS Inferentia2 with Hugging Face Optimum on Amazon SageMaker. We are going to use the Hugging Face TGI Neuron Container, a purpose-built inference container that makes it easy to deploy LLMs on AWS Inferentia2, powered by [Text Generation Inference](https://huggingface.co/docs/text-generation-inference/index) and [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index).\n",
    "\n",
    "\n",
    "We will cover how to:\n",
    "1. [Setup development environment](#1-setup-development-environment)\n",
    "2. [Retrieve the latest Hugging Face TGI Neuron DLC](#2-retrieve-the-latest-hugging-face-tgi-neuron-dlc)\n",
    "3. [Deploy Llama 3.3 70B to Inferentia2](#3-deploy-llama-33-70b-to-inferentia2)\n",
    "4. [Clean up](#4-clean-up)\n",
    "\n",
    "Let's get started! 🚀\n",
    "\n",
    "[AWS Inferentia (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) are purpose-built EC2 instances for deep learning (DL) inference workloads. Here are the different instances of the Inferentia2 family.\n",
    "\n",
    "| instance size | accelerators | Neuron Cores | accelerator memory (GB) | vCPU | CPU memory (GB) | on-demand price ($/h) |\n",
    "| ------------- | ------------ | ------------ | ----------------------- | ---- | --------------- | --------------------- |\n",
    "| inf2.xlarge | 1 | 2 | 32 | 4 | 16 | 0.76 |\n",
    "| inf2.8xlarge | 1 | 2 | 32 | 32 | 128 | 1.97 |\n",
    "| inf2.24xlarge | 6 | 12 | 192 | 96 | 384 | 6.49 |\n",
    "| inf2.48xlarge | 12 | 24 | 384 | 192 | 768 | 12.98 |\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup development environment\n",
    "\n",
    "For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) kernel and the `sagemaker` Python SDK to deploy Llama 3.3 70B to a SageMaker inference endpoint.\n",
    "\n",
    "Make sure you have the latest version of the SageMaker SDK installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install sagemaker --upgrade --quiet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then, instantiate the sagemaker role and session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import boto3\n",
    "sess = sagemaker.Session()\n",
    "# sagemaker session bucket -> used for uploading data, models and logs\n",
    "# sagemaker will automatically create this bucket if it does not exist\n",
    "sagemaker_session_bucket=None\n",
    "if sagemaker_session_bucket is None and sess is not None:\n",
    "    # set to default bucket if a bucket name is not given\n",
    "    sagemaker_session_bucket = sess.default_bucket()\n",
    "\n",
    "try:\n",
    "    role = sagemaker.get_execution_role()\n",
    "except ValueError:\n",
    "    iam = boto3.client('iam')\n",
    "    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n",
    "\n",
    "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n",
    "\n",
    "print(f\"sagemaker role arn: {role}\")\n",
    "print(f\"sagemaker session region: {sess.boto_region_name}\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Retrieve the latest Hugging Face TGI Neuron DLC\n",
    "\n",
    "The latest Hugging Face TGI Neuron DLCs can be used to run inference on AWS Inferentia2. You can use the `get_huggingface_llm_image_uri` method of the `sagemaker` SDK to retrieve the appropriate Hugging Face TGI Neuron DLC URI based on your desired `backend`, `session`, `region`, and `version`. If the latest version of the container is not yet available through the SageMaker SDK, you can find it [here](https://huggingface.co/docs/optimum-neuron/containers)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "from sagemaker.huggingface import get_huggingface_llm_image_uri\n",
    "\n",
    "# retrieve the llm image uri\n",
    "llm_image = get_huggingface_llm_image_uri(\n",
    "  \"huggingface-neuronx\",\n",
    ")\n",
    "\n",
    "print(f\"llm image uri: {llm_image}\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Deploy Llama 3.3 70B to Inferentia2\n",
    "\n",
    "At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes), which means that we need to specify our sequence length and batch size ahead of time.\n",
    "To make it easier for customers to utilize the full power of Inferentia2, we created a [neuron model cache](https://huggingface.co/docs/optimum-neuron/guides/cache_system), which contains pre-compiled configurations for the most popular LLMs, including Llama 3.3 70B. \n",
    "\n",
    "This means we don't need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find the compiled/cached configurations on the [Hugging Face Hub](https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config). If your desired configuration is not yet cached, you can compile it yourself using the [Optimum CLI](https://huggingface.co/docs/optimum-neuron/guides/export_model) or open a request at the [Cache repository](https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions).\n",
    "\n",
    "**Deploying Llama 3.3 70B to a SageMaker Endpoint** \n",
    "\n",
    "Before deploying the model to Amazon SageMaker, we must define the TGI Neuron endpoint configuration. We need to make sure the following additional parameters are defined: \n",
    "\n",
    "- `HF_NUM_CORES`: Number of Neuron Cores used for the compilation.\n",
    "- `HF_BATCH_SIZE`: The batch size that was used to compile the model.\n",
    "- `HF_SEQUENCE_LENGTH`: The sequence length that was used to compile the model.\n",
    "- `HF_AUTO_CAST_TYPE`: The auto cast type that was used to compile the model.\n",
    "\n",
    "We also need to define the traditional TGI parameters:\n",
    "\n",
    "- `HF_MODEL_ID`: The Hugging Face model ID.\n",
    "- `HF_TOKEN`: The Hugging Face API token to access gated models.\n",
    "- `MAX_BATCH_SIZE`: The maximum batch size that the model can handle, equal to the batch size used for compilation.\n",
    "- `MAX_INPUT_TOKENS`: The maximum number of input tokens the model can handle. \n",
    "- `MAX_TOTAL_TOKENS`: The maximum total number of tokens (input and generated), equal to the sequence length used for compilation.\n",
    "\n",
    "Optionally, you can configure the endpoint to support chat templates:\n",
    "- `MESSAGES_API_ENABLED`: Enable the Messages API \n",
    "\n",
    "**Select the right instance type**\n",
    "\n",
    "Llama 3.3 70B is a large model and requires a lot of memory. We are going to use the `inf2.48xlarge` instance type, which has 192 vCPUs and 384 GB of accelerator memory. The `inf2.48xlarge` instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. You can find the cached configurations for Llama 3.3 70B [here](https://huggingface.co/aws-neuron/optimum-neuron-cache/blob/main/inference-cache-config/llama3-70b.json#L16). In our case we will use a batch size of 4 and a sequence length of 4096. \n",
    "\n",
    "\n",
    "Before we can deploy Llama 3.3 70B to Inferentia2, we need to make sure we have the necessary permissions to access the model. You can request access to the model [here](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) and create a User access token following this [guide](https://huggingface.co/docs/hub/en/security-tokens).\n",
    "\n",
    "\n",
    "After that we can create our endpoint configuration and deploy the model to Amazon SageMaker."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "from sagemaker.huggingface import HuggingFaceModel\n", + "\n", + "# sagemaker config\n", + "instance_type = \"ml.inf2.48xlarge\"\n", + "health_check_timeout=2400 # additional time to load the model\n", + "volume_size=512 # size in GB of the EBS volume\n", + "\n", + "# Define Model and Endpoint configuration parameter\n", + "config = {\n", + " \"HF_MODEL_ID\": \"meta-llama/Meta-Llama-3-70B-Instruct\",\n", + " \"HF_NUM_CORES\": \"24\", # number of neuron cores\n", + " \"HF_AUTO_CAST_TYPE\": \"bf16\", # dtype of the model\n", + " \"MAX_BATCH_SIZE\": \"4\", # max batch size for the model\n", + " \"MAX_INPUT_TOKENS\": \"4000\", # max length of input text\n", + " \"MAX_TOTAL_TOKENS\": \"4096\", # max length of generated text\n", + " \"MESSAGES_API_ENABLED\": \"true\", # Enable the messages API\n", + " \"HF_TOKEN\": \"\",\n", + "}\n", + "\n", + "assert config[\"HF_TOKEN\"] != \"\", \"Please replace '' with your Hugging Face Hub API token\"\n", + "\n", + "\n", + "# create HuggingFaceModel with the image uri\n", + "llm_model = HuggingFaceModel(\n", + " role=role,\n", + " image_uri=llm_image,\n", + " env=config\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we have created the `HuggingFaceModel` we can deploy it to Amazon SageMaker using the `deploy` method. We will deploy the model with the `ml.inf2.48xlarge` instance type. TGI will automatically distribute and shard the model across all Inferentia devices." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [], + "source": [ + "# deactivate warning since model is compiled\n", + "llm_model._is_compiled_model = True\n", + "\n", + "llm = llm_model.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=instance_type,\n", + " container_startup_health_check_timeout=health_check_timeout,\n", + " volume_size=volume_size\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SageMaker will now create our endpoint and deploy the model to it. It takes around 30-40 minutes, we are working on improving the deployment time." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After our endpoint is deployed we can run inference on it. We will use the `predict` method from the `predictor` to run inference on our endpoint. We can inference with different parameters to impact the generation. Parameters can be defined as in the `parameters` attribute of the payload. You can find supported parameters in the [here](https://huggingface.co/docs/text-generation-inference/messages_api).\n", + "\n", + "The Messages API allows us to interact with the model in a conversational way. We can define the role of the message and the content. The role can be either `system`,`assistant` or `user`. 
    "\n",
    "```json\n",
    "{\n",
    "  \"messages\": [\n",
    "    { \"role\": \"system\", \"content\": \"You are a helpful assistant.\" },\n",
    "    { \"role\": \"user\", \"content\": \"What is deep learning?\" }\n",
    "  ]\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Prompt to generate\n",
    "messages = [\n",
    "  { \"role\": \"system\", \"content\": \"You are a helpful assistant.\" },\n",
    "  { \"role\": \"user\", \"content\": \"What is deep learning?\" }\n",
    "]\n",
    "\n",
    "# Generation arguments\n",
    "parameters = {\n",
    "  \"model\": \"meta-llama/Llama-3.3-70B-Instruct\", # placeholder, needed\n",
    "  \"top_p\": 0.6,\n",
    "  \"temperature\": 0.9,\n",
    "  \"max_tokens\": 50,\n",
    "  \"stop\": [\"<|eot_id|>\"],\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Okay, let's test it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "chat = llm.predict({\"messages\": messages, **parameters})\n",
    "\n",
    "print(chat[\"choices\"][0][\"message\"][\"content\"].strip())"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Clean up\n",
    "\n",
    "To clean up, we can delete the model and endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm.delete_model()\n",
    "llm.delete_endpoint()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "5fcf248a74081676ead7e77f54b2c239ba2921b952f7cbcdbbe5427323165924"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}