diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f44f8153..bc7a8f96 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -65,7 +65,7 @@ repos:
       - id: check-json
       - id: check-toml
       - id: check-yaml
-        exclude: ^deploy(\/[^\/]+)*\/templates\/.*$
+        exclude: ^Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/.+$
       - id: check-shebang-scripts-are-executable
       - id: end-of-file-fixer
         types_or: [c, c++, cuda, proto, textproto, java, python]
diff --git a/Deployment/Kubernetes/README.md b/Deployment/Kubernetes/README.md
new file mode 100644
index 00000000..72bcd0ea
--- /dev/null
+++ b/Deployment/Kubernetes/README.md
@@ -0,0 +1,3 @@
+# Kubernetes Deployment of Triton Server Guides
+
+* [TensorRT-LLM Gen. AI Autoscaling & Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md)
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md
new file mode 100644
index 00000000..80d99e84
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md
@@ -0,0 +1,907 @@
+
+
+# Autoscaling and Load Balancing Generative AI w/ Triton Server and TensorRT-LLM
+
+Setting up autoscaling and load balancing using Triton Inference Server, TensorRT-LLM or vLLM, and Kubernetes is not difficult,
+but it does require preparation.
+
+This guide aims to help you automate the acquisition of models from Hugging Face, minimize the time spent optimizing models for
+TensorRT, and configure automatic scaling and load balancing for your models. This guide does not cover Kubernetes'
+basics, secure ingress/egress from your cluster to external clients, nor cloud provider interfaces or implementations of
+Kubernetes.
+
+We'll cover the following topics:
+
+* [Cluster Setup](#cluster-setup)
+  * [Core Cluster Services](#core-cluster-services)
+    * [Kubernetes Node Feature Discovery service](#kubernetes-node-feature-discovery-service)
+    * [NVIDIA Device Plugin for Kubernetes](#nvidia-device-plugin-for-kubernetes)
+    * [NVIDIA GPU Feature Discovery service](#nvidia-gpu-feature-discovery-service)
+  * [Metrics Collection Services](#metrics-collection-services)
+    * [Create a Monitoring Namespace](#create-a-monitoring-namespace)
+    * [Prometheus Services](#prometheus-services)
+    * [NVIDIA Data Center GPU Manager (DCGM) Exporter](#nvidia-data-center-gpu-manager-dcgm-exporter)
+    * [Connect DCGM and Triton Metrics to Prometheus](#connect-dcgm-and-triton-metrics-to-prometheus)
+    * [Triton Metrics Prometheus Rule](#triton-metrics-prometheus-rule)
+  * [Hugging Face Authorization](#hugging-face-authorization)
+* [Triton Preparation](#triton-preparation)
+  * [Model Preparation Script](#model-preparation-script)
+  * [Custom Container Image](#custom-container-image)
+  * [Kubernetes Pull Secrets](#kubernetes-pull-secrets)
+* [Triton Deployment](#triton-deployment)
+  * [Deploying Single GPU Models](#deploying-single-gpu-models)
+  * [Deploying Models Too Large for a Single GPU](#deploying-models-too-large-for-a-single-gpu)
+  * [Utilizing Multiple GPU SKUs](#utilizing-multiple-gpu-skus)
+  * [Monitoring Triton in Kubernetes](#monitoring-triton-in-kubernetes)
+* [Developing this Guide](#developing-this-guide)
+
+Prior to beginning this guide/tutorial you will need a few things.
+
+* Kubernetes Control CLI (`kubectl`)
+  [ [documentation](https://kubernetes.io/docs/reference/kubectl/introduction/)
+  | [download](https://kubernetes.io/releases/download/) ]
+* Helm CLI (`helm`)
+  [ [documentation](https://helm.sh/)
+  | [download](https://helm.sh/docs/intro/install) ]
+* Docker CLI (`docker`)
+  [ [documentation](https://docs.docker.com/)
+  | [download](https://docs.docker.com/get-docker/) ]
+* Decent text editing software for editing YAML files.
+* Kubernetes cluster.
+* Fully configured `kubectl` with administrator permissions to the cluster.
+
+## Cluster Setup
+
+The following instructions set up Horizontal Pod Autoscaling (HPA) for Triton Server in a Kubernetes cluster.
+
+
+### Prerequisites
+
+This guide assumes that all nodes with NVIDIA GPUs have the following:
+- A node label of `nvidia.com/gpu=present` to more easily identify nodes with NVIDIA GPUs.
+- A node taint of `nvidia.com/gpu=present:NoSchedule` to prevent non-GPU pods from being deployed to GPU nodes.
+
+> When using a Kubernetes provider like AKS, EKS, or GKE, it is usually best to use their interface when configuring nodes
+> instead of using `kubectl` to do it directly.
+
+
+### Core Cluster Services
+
+Once all nodes are correctly labeled and tainted, use the following steps to prepare the cluster to collect and serve the
+necessary metrics to enable automated horizontal pod autoscaling for Triton Server.
+
+The following series of steps is intended to prepare a fresh cluster.
+For clusters in varying states, it is best to coordinate with your cluster administrator before installing new services and
+capabilities.
+
+#### Kubernetes Node Feature Discovery service
+
+1. Add the Kubernetes Node Feature Discovery chart repository to the local cache.
+
+   ```bash
+   helm repo add kube-nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts \
+     && helm repo update
+   ```
+
+2. Run the command below to install the service.
+
+   ```bash
+   helm install -n kube-system node-feature-discovery kube-nfd/node-feature-discovery \
+     --set nameOverride=node-feature-discovery \
+     --set worker.tolerations[0].key=nvidia.com/gpu \
+     --set worker.tolerations[0].operator=Exists \
+     --set worker.tolerations[0].effect=NoSchedule
+   ```
+
+#### NVIDIA Device Plugin for Kubernetes
+
+1. This step is unnecessary if the Device Plugin has already been installed in your cluster.
+   Cloud provider turnkey Kubernetes clusters, such as those from AKS, EKS, and GKE, often have the Device Plugin installed
+   automatically once a GPU node has been added to the cluster.
+
+   To check if your cluster requires the NVIDIA Device Plugin for Kubernetes, run the following command and inspect
+   the output for `nvidia-device-plugin-daemonset`.
+
+   ```bash
+   kubectl get daemonsets -n kube-system
+   ```
+
+   Example output:
+   ```text
+   NAME                             DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE
+   kube-proxy                       6         6         6       6            6
+   nvidia-device-plugin-daemonset   6         6         6       6            6
+   ```
+
+2. Run the command below to install the plugin.
+   Once installed, it will provide containers access to GPUs in your cluster.
+
+   For additional information, see
+   [Github/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin/blob/main/README.md).
+
+   ```bash
+   kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml
+   ```
+
+#### NVIDIA GPU Feature Discovery Service
+
+1. Use the YAML contents below to create a file named `nvidia_gpu-feature-discovery_daemonset.yaml`.
+
+   > [nvidia_gpu-feature-discovery_daemonset.yaml](nvidia_gpu-feature-discovery_daemonset.yaml)
+
+   The file above was created by downloading its contents from
+   [GitHub/NVIDIA](https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml)
+   and modified specifically for Triton Server autoscaling.
+
+   ```bash
+   curl https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml \
+     > nvidia_gpu-feature-discovery_daemonset.yaml
+   ```
+
+2. Then run the command below to install the daemon set.
+
+   ```bash
+   kubectl apply -f ./nvidia_gpu-feature-discovery_daemonset.yaml
+   ```
+
+
+### Metrics Collection Services
+
+Your cluster is now up, running, and can even assign GPU resources to containers.
+Next, we have to set up metrics collection for DCGM and Triton Server.
+Metrics provide insight to the Kubernetes Horizontal Pod Autoscaler service and enable it to make autoscaling decisions based
+on the utilization and availability of deployed models.
+
+#### Create a Monitoring Namespace
+
+Create the `monitoring` namespace in your cluster for all of the metrics and monitoring services.
+
+1. Run the command below to create the namespace.
+
+   ```bash
+   kubectl create namespace monitoring
+   ```
+
+#### Prometheus Services
+
+We need a service to collect, store, aggregate, and provide metrics collected from your cluster and the services deployed in it.
+One of the easiest ways to do this is to leverage the functionality of the [Prometheus Metrics Server](https://prometheus.io/).
+Using the following steps, we'll install the Prometheus Stack for Kubernetes Helm chart so that we can leverage Prometheus.
+
+1. Add the Prometheus Community chart repository to the local cache.
+
+   ```bash
+   helm repo add prometheus-community https://prometheus-community.github.io/helm-charts \
+     && helm repo update
+   ```
+
+2. Run the command below to install the Prometheus Kubernetes Stack Helm chart.
+
+   ```bash
+   helm install -n monitoring prometheus prometheus-community/kube-prometheus-stack \
+     --set tolerations[0].key=nvidia.com/gpu \
+     --set tolerations[0].operator=Exists \
+     --set tolerations[0].effect=NoSchedule
+   ```
+
+#### NVIDIA Data Center GPU Manager (DCGM) Exporter
+
+The best solution for management of GPUs in your cluster is
+[NVIDIA DCGM](https://docs.nvidia.com/data-center-gpu-manager-dcgm) (DCGM).
+However, for this example we do not need the entirety of the DCGM stack.
+Instead, we'll use the steps below to install the [DCGM Exporter](https://github.com/NVIDIA/dcgm-exporter) to enable the
+collection of GPU metrics in your cluster.
+
+1. Add the NVIDIA DCGM chart repository to the local cache.
+
+   ```bash
+   helm repo add nvidia-dcgm https://nvidia.github.io/dcgm-exporter/helm-charts \
+     && helm repo update
+   ```
+
+2. Use the YAML contents below to create a file named `nvidia_dcgm-exporter_values.yaml`.
+
+   > [nvidia_dcgm-exporter_values.yaml](nvidia_dcgm-exporter_values.yaml)
+
+   The contents above were generated using `helm show values nvidia-dcgm/dcgm-exporter` and then modified specifically for
+   Triton Server autoscaling.
+
+3. Install the DCGM Exporter Helm chart using the following command.
+
+   ```bash
+   helm install -n monitoring dcgm-exporter nvidia-dcgm/dcgm-exporter --values nvidia_dcgm-exporter_values.yaml
+   ```
+
+#### Connect DCGM and Triton Metrics to Prometheus
+
+We need to provide a mechanism that will scrape the metrics produced by DCGM Exporter and inject them into the Prometheus
+metrics server.
+The steps below will set up a Prometheus Adapter that collects metrics from every DCGM Exporter worker and provides them to
+Prometheus.
+
+1. Run the command below to install the Prometheus Adapter Helm chart.
+
+   ```bash
+   helm install -n monitoring prometheus-adapter prometheus-community/prometheus-adapter \
+     --set metricsRelistInterval=6s \
+     --set customLabels.monitoring=prometheus-adapter \
+     --set customLabels.release=prometheus \
+     --set prometheus.url=http://prometheus-kube-prometheus-prometheus \
+     --set additionalLabels.release=prometheus
+   ```
+
+2. To verify that the adapter is installed and configured correctly, wait for at least 60 seconds and then run the
+   following command.
+   It is important to note that there is a noticeable delay between the adapter being installed and the availability of custom
+   metrics.
+
+   ```bash
+   kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1
+   ```
+
+   If the command fails, wait longer and retry. If the command fails for more than a few minutes, then the adapter is
+   misconfigured and will require intervention.
+
+
+#### Triton Metrics Prometheus Rule
+
+Prometheus rules provide a mechanism for the generation of metrics data using a formula that operates on data being collected by
+Prometheus.
+We'll create a set of rules specific to Triton Server which generate metrics useful for autoscaling.
+
+1. Use the YAML contents below to create a file named `triton-metrics_prometheus-rule.yaml`.
+
+   > [triton-metrics_prometheus-rule.yaml](triton-metrics_prometheus-rule.yaml)
+
+2. Run the following command to create the necessary Prometheus Rule in the cluster. _Note_ that the rule will be created
+   in the namespace of your current context, usually `default`.
+   If you prefer to install it in a different namespace, you can either update your context or add `-n ` to
+   the command.
+
+   ```bash
+   kubectl apply -f ./triton-metrics_prometheus-rule.yaml
+   ```
+
+In all of the values files for the example Helm chart, the horizontal pod autoscaler is configured to use the
+`triton:queue_compute:ratio` metric provided by the above rules.
+The benefit of using this metric is that it is hardware and model independent since it measures the ratio of the time a
+request spends in the inference queue to the time it takes to complete once it has left the queue.
+This kind of metric allows the performance of models on diverse hardware to be compared to each other.
+
+If absolute response times are a more important metric, the `triton:request_duration:average` or
+`triton:compute_duration:average` metrics would more likely meet this requirement.
+
+
+### Hugging Face Authorization
+
+In order to download models from Hugging Face, your pods will require an access token with the appropriate permission to
+download models from their servers.
+
+1. If you do not already have a Hugging Face access token, you will need to create one.
+   To create a Hugging Face access token,
+   [follow their guide](https://huggingface.co/docs/hub/en/security-tokens).
+
+2. Once you have a token, use the command below to persist the token as a secret named `hf-model-pull` in your cluster.
+
+   ```bash
+   kubectl create secret generic hf-model-pull '--from-literal=password='
+   ```
+
+
+3. To verify that your secret has been created, use the following command and inspect the output for your secret.
+
+   ```bash
+   kubectl get secrets
+   ```
+
+
+## Triton Preparation
+
+### Model Preparation Script
+
+This script will be executed by every pod created for a model deployment as part of the pod's initialization phase
+(i.e. before Triton Server is started).
+
+The intention of this script is to handle the acquisition of the model file from Hugging Face, the generation of the TensorRT
+engine and plan files, and the caching of said generated files.
+The script depends on the fact that the Kubernetes deployment scripts we'll be using rely on host storage (caching files on
+the nodes themselves).
+
+Specifically, the model and engine directories will be mapped to folders on the host node and remapped to all subsequent
+pods deployed on the same node.
+This enables the generation script to detect that the plan and engine generation steps have been completed and not repeat work.
+
+When Triton Server is started, the same host folders will be mounted to its container and Triton will use the pre-generated
+model plan and engine files.
+This drastically reduces the time required for subsequent pod starts on the same node.
+
+1. Create a Python file with the content below named `server.py`.
+
+   > [server.py](containers/server.py)
+
+   This solution could be further improved by adding a network storage location shared by all nodes in a cluster that
+   could be used to globally cache per model/GPU plan and engine files.
+   Subsequent pod starts on new nodes with the same GPU could download the pregenerated files instead of generating them
+   locally.
+   This could save significant time depending on the delta between the time to download the files and the time to generate them
+   (likely several seconds at least).
+
+### Custom Container Image
+
+1. Create a container file with the content below named `triton_trt-llm.containerfile`.
+
+   > [triton_trt-llm.containerfile](containers/server.containerfile)
+
+2. Run the following command to create a custom Triton Inference Server image w/ all necessary tools to generate TensorRT-LLM
+   plan and engine files. In this example we'll use the tag `24.04` to match the date portion of `24.04-trtllm-python-py3`
+   from the base image.
+
+   ```bash
+   docker build \
+     --file ./triton_trt-llm.containerfile \
+     --rm \
+     --tag triton_trt-llm:24.04 \
+     .
+   ```
+
+   ##### Custom Version of Triton CLI
+
+   This custom Triton Server container image makes use of a custom version of the Triton CLI.
+   The relevant changes have been made available as a
+   [topic branch](https://github.com/triton-inference-server/triton_cli/tree/jwyman/aslb-mn) in the Triton CLI repository on
+   GitHub.
+   The changes in the branch can be
+   [inspected](https://github.com/triton-inference-server/triton_cli/compare/main...jwyman/aslb-mn) using the GitHub
+   interface, and primarily contain the addition of the ability to specify tensor parallelism when optimizing models for
+   TensorRT-LLM and support for additional models.
+
+3. Upload the Container Image to a Cluster Visible Repository.
+
+   In order for your Kubernetes cluster to be able to download our new container image, it will need to be pushed to a
+   container image repository that nodes in your cluster can reach.
+   In this example, we'll use the `nvcr.io/nvaie/staging` repository for demonstration purposes.
+   You will need to determine which repositories you have write access to that your cluster can also access.
+
+
+   1. First, re-tag the container image with the repository's name like below.
+
+      ```bash
+      docker tag \
+        triton_trt-llm:24.04 \
+        nvcr.io/nvaie/staging/triton_trt-llm:24.04
+      ```
+
+   2. Next, upload the container image to your repository.
+
+      ```bash
+      docker push nvcr.io/nvaie/staging/triton_trt-llm:24.04
+      ```
+
+### Kubernetes Pull Secrets
+
+If your container image repository requires credentials in order to download images, then you will need to create a Kubernetes
+docker-registry secret.
+We'll be using the `nvcr.io` container image repository example above for demonstration purposes.
+Be sure to properly escape any special characters such as `$` in the password or username values.
+
+1. Use the command below to create the necessary secret. Secrets for your repository should be similar to, but not identical
+   to, the example below.
+
+   ```bash
+   kubectl create secret docker-registry ngc-container-pull \
+     --docker-password='dGhpcyBpcyBub3QgYSByZWFsIHNlY3JldC4gaXQgaXMgb25seSBmb3IgZGVtb25zdHJhdGlvbiBwdXJwb3Nlcy4' \
+     --docker-server='nvcr.io' \
+     --docker-username='\$oauthtoken'
+   ```
+
+2. The above command will create a secret in your cluster named `ngc-container-pull`.
+   You can verify that the secret was created correctly using the following command and inspecting its output for the secret
+   you're looking for.
+
+   ```bash
+   kubectl get secrets
+   ```
+
+3. To ensure the contents of the secret are correct, you can run the following command.
+
+   ```bash
+   kubectl get secret/ngc-container-pull -o yaml
+   ```
+
+   You should see an output similar to the following.
+
+   ```yaml
+   apiVersion: v1
+   data:
+     .dockerconfigjson: eyJhdXRocyI6eyJudmNyLmlvIjp7InVzZXJuYW1lIjoiJG9hdXRodG9rZW4iLCJwYXNzd29yZCI6IlZHaHBjeUJwY3lCdWIzUWdZU0J5WldGc0lITmxZM0psZEN3Z2FYUWdhWE1nYjI1c2VTQm1iM0lnWkdWdGIyNXpkSEpoZEdsdmJpQndkWEp3YjNObGN5ND0iLCJhdXRoIjoiSkc5aGRYUm9kRzlyWlc0NlZrZG9jR041UW5CamVVSjFZak5SWjFsVFFubGFWMFp6U1VoT2JGa3pTbXhrUTNkbllWaFJaMkZZVFdkaU1qVnpaVk5DYldJelNXZGFSMVowWWpJMWVtUklTbWhrUjJ4MlltbENkMlJZU25kaU0wNXNZM2swWjFWSGVHeFpXRTVzU1VjMWJHUnRWbmxKU0ZaNldsTkNRMWxZVG14T2FsRm5aRWM0WjJGSGJHdGFVMEo1V2xkR2MwbElUbXhaTTBwc1pFaE5hQT09In19fQ==
+   kind: Secret
+   metadata:
+     name: ngc-container-pull
+     namespace: default
+   type: kubernetes.io/dockerconfigjson
+   ```
+
+   The value of `.dockerconfigjson` is a base-64 encoded string which can be decoded into the following.
+
+   ```json
+   {
+     "auths": {
+       "nvcr.io": {
+         "username":"$oauthtoken",
+         "password":"VGhpcyBpcyBub3QgYSByZWFsIHNlY3JldCwgaXQgaXMgb25seSBmb3IgZGVtb25zdHJhdGlvbiBwdXJwb3Nlcy4gUGxlYXNlIG5ldmVyIHVzZSBCYXNlNjQgdG8gaGlkZSByZWFsIHNlY3JldHMh",
+         "auth":"JG9hdXRodG9rZW46VkdocGN5QnBjeUJ1YjNRZ1lTQnlaV0ZzSUhObFkzSmxkQ3dnYVhRZ2FYTWdiMjVzZVNCbWIzSWdaR1Z0YjI1emRISmhkR2x2YmlCd2RYSndiM05sY3k0Z1VHeGxZWE5sSUc1bGRtVnlJSFZ6WlNCQ1lYTmxOalFnZEc4Z2FHbGtaU0J5WldGc0lITmxZM0psZEhNaA=="
+       }
+     }
+   }
+   ```
+
+   The values of `password` and `auth` are also base-64 encoded strings.
+   We recommend inspecting the following values:
+
+   * Value of `.auths['nvcr.io'].username`.
+   * Base64 decoded value of `.auths['nvcr.io'].password`.
+   * Base64 decoded value of `.auths['nvcr.io'].auth`.
+
+
+## Triton Deployment
+
+### Deploying Single GPU Models
+
+Deploying Triton Server with a model that fits on a single GPU is straightforward using the steps below.
+
+1. Create a custom values file with required values:
+
+   * Container image name.
+   * Model name.
+   * Supported / available GPU(s).
+   * Image pull secrets (if necessary).
+   * Hugging Face secret name.
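+
+   For reference, a minimal custom values file might look like the following sketch. The image name, GPU product label, and
+   secret names shown here are placeholders and should be replaced with values appropriate to your cluster.
+
+   ```yaml
+   # example_custom_values.yaml -- illustrative only.
+   triton:
+     image:
+       name: nvcr.io/example/triton_trt-llm:24.04
+       pullSecrets:
+       - name: ngc-container-pull
+
+   gpu:
+   - NVIDIA-A10G
+
+   model:
+     name: llama-3-8b
+     pullSecret: hf-model-pull
+   ```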
+
+   The provided sample Helm [chart](./chart/) includes several example values files such as
+   [llama-3-8b_values.yaml](./chart/llama-3-8b_values.yaml).
+
+2. Deploy LLM on Triton + TRT-LLM.
+
+   Apply the custom values file to override the exported base values file using the command below, and create the Triton
+   Server Kubernetes deployment.
+
+   _Note: the order in which the values files are specified on the command line is important, as values are applied and
+   override existing values in the order they are specified._
+
+   ```bash
+   helm install \
+     --values ./chart/values.yaml \
+     --values ./chart/.yaml \
+     --set 'triton.image.name=' \
+     ./chart/.
+   ```
+
+   _Be sure to substitute the correct values for `` and `` in the example above._
+
+3. Verify the Chart Installation.
+
+   Use the following commands to inspect the installed chart and to determine if everything is working as intended.
+
+   ```bash
+   kubectl get deployments,pods,hpa,services,podmonitors --selector='app='
+   ```
+
+   _Be sure to substitute the correct value for `` in the example above._
+
+   You should see output similar to below (assuming the installation name of "llama-3"):
+
+   ```text
+   NAME                      READY   UP-TO-DATE   AVAILABLE
+   deployment.apps/llama-3   0/1     1            0
+
+   NAME                          READY   STATUS    RESTARTS
+   pod/llama-3-7989ffd8d-ck62t   0/1     Pending   0
+
+   NAME                                          REFERENCE            TARGETS   MINPODS   MAXPODS   REPLICAS
+   horizontalpodautoscaler.autoscaling/llama-3   Deployment/llama-3   0/1       1         8         1
+
+   NAME              TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)
+   service/llama-3   ClusterIP   10.100.23.237                 8000/TCP,8001/TCP,8002/TCP
+
+   NAME
+   podmonitor.monitoring.coreos.com/llama-3
+   ```
+
+   HPA `TARGETS` might show as `/1`.
+   This is not necessarily an issue. It is most likely caused by a lack of client applications sending inference queries to
+   Triton Server.
+   Without inference queries, there are no metrics generated and thus the HPA controller reports the metric's current values
+   as ``.
+   This issue should resolve itself once clients begin sending inference queries to Triton Server.
+
+4. Uninstalling the Chart
+
+   Uninstalling a Helm chart is as straightforward as running the command below.
+   This is useful when experimenting with various options and configurations.
+
+   ```bash
+   helm uninstall
+   ```
+
+
+### Deploying Models Too Large for a Single GPU
+
+Given the memory requirements of some AI models, it is not possible to host them using a single device.
+Triton and TensorRT-LLM provide a mechanism to enable a large model to be hosted by multiple GPU devices working in concert.
+The provided sample Helm [chart](./chart/) provides a mechanism for taking advantage of this capability.
+
+To enable this feature, adjust the `model.tensorrtLlm.parallelism.tensor` value to an integer greater than 1.
+Configuring a model to use tensor parallelism enables the TensorRT-LLM runtime to effectively combine the memory of multiple
+GPUs to host a model too large to fit on a single GPU.
+
+Similarly, changing the value of `model.tensorrtLlm.parallelism.pipeline` will enable pipeline parallelism.
+Pipeline parallelism is used to combine the compute capacity of multiple GPUs to process inference requests in parallel.
+
+The number of GPUs required to host the model is equal to the product of the values of `.tensor` and `.pipeline`.
+It is important to note that the GPUs used to host a model must reside on the same node.
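+
+For example, a values file fragment like the following (the numbers are illustrative) would request an engine built with
+4-way tensor parallelism and 2-way pipeline parallelism, and would therefore require a node with 8 available GPUs:
+
+```yaml
+model:
+  name: llama-3-70b-instruct
+  tensorrtLlm:
+    parallelism:
+      tensor: 4
+      pipeline: 2
+```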
+
+_Combining GPUs which reside on separate nodes is not covered in this guide._
+
+
+### Utilizing Multiple GPU SKUs
+
+Given the relatively limited availability of certain SKUs of GPU, it is not uncommon for services to be required to operate on a
+mix of GPU hardware.
+For example, the number of nodes with NVIDIA Hopper based devices might be insufficient to meet load requirements and your
+clusters have spare nodes with NVIDIA Ampere based devices.
+In this scenario, it would make sense to create multiple deployments of the same model using the steps
+[above](#deploying-single-gpu-models) and placing them all behind a single Kubernetes service for load-balancing needs.
+Doing so will enable both SKUs of devices to automatically scale independently and provide compute capacity for the service.
+
+To achieve this, we can update the chart to not create a service with our deployment and to include the selector labels
+specified by the shared service.
+In the example below, we'll assume the service has already been created and its selector is set to `model=llama-3-8b`.
+
+```bash
+helm install llama-3-8b-a100 ./chart/. \
+  --values ./chart/values.yaml \
+  --values ./chart/llama-3-8b_values.yaml \
+  --set 'triton.image.name=' \
+  --set 'gpu[0]=NVIDIA-A100-SXM4-40GB' \
+  --set 'kubernetes.labels[0].model=llama-3-8b' \
+  --set 'kubernetes.noService=true'
+
+helm install llama-3-8b-h100 ./chart/. \
+  --values ./chart/values.yaml \
+  --values ./chart/llama-3-8b_values.yaml \
+  --set 'triton.image.name=' \
+  --set 'gpu[0]=NVIDIA-H100-SXM5-80GB' \
+  --set 'kubernetes.labels[0].model=llama-3-8b' \
+  --set 'kubernetes.noService=true'
+```
+
+The result will be two deployments in your cluster, both of which are part of your service's load-balancing pool.
+
+```bash
+kubectl get deployments --selector='model=llama-3-8b'
+NAME              READY   UP-TO-DATE   AVAILABLE
+llama-3-8b-a100   1/1     1            1
+llama-3-8b-h100   1/1     1            1
+```
+
+
+### Monitoring Triton in Kubernetes
+
+Monitoring Triton in Kubernetes can be done using the Prometheus software installed as part of the
+[Prometheus Services](#prometheus-services) section of this document.
+The installed software includes a Grafana dashboard server.
+To connect to the Grafana server, we first need to create a networking tunnel from your local workstation into your cluster.
+
+1. Run the following command to create a networking tunnel from a local machine into the Kubernetes cluster.
+
+   ```bash
+   kubectl port-forward -n monitoring svc/prometheus-grafana 8080:80
+   ```
+
+   This creates a tunnel from port `8080` on your local machine to the Grafana server in the cluster on port `80`.
+   When successful, you should see output that looks something like the example below.
+
+   ```bash
+   Forwarding from 127.0.0.1:8080 -> 3000
+   Forwarding from [::1]:8080 -> 3000
+   ```
+
+2. Open a web browser and enter `http://127.0.0.1:8080/` into the address bar.
+
+3. The first time you do this, you will need to log in to Grafana.
+   Use the following username and password to complete the login.
+
+   * Username: `admin`
+   * Password: `prom-operator`
+
+   _The above are the default username and password for Grafana when it is installed as part of the Prometheus Helm chart._
+
+4. The first thing we'll want to do is to create a new custom dashboard.
+   To do this, click on the `+` icon in the upper-right of the user interface and select `New dashboard` from the dropdown menu.
+
+   ![Visualization of the "new dashboard" interface](./images/grafana_new-dashboard.png)
+
+
+5. Grafana will prompt you as to how you want to create a new dashboard.
+   Select the `Import dashboard` option.
+
+   ![Visualization of the "new dashboard" interface](./images/grafana_import-dashboard.png)
+
+6. Either copy the content from the provided [grafana_inference-metrics_dashboard.json](./grafana_inference-metrics_dashboard.json) file and
+   paste it into the text box named `Import via dashboard JSON model`, or upload the file using the user interface's
+   `Upload dashboard JSON file` tool.
+
+7. Once you've created the new dashboard, you should see something that looks like the image below.
+
+   ![Example Grafana dashboard created by following the above instructions.](./images/grafana-dashboard.png)
+
+Once the dashboard has been set up, you will be able to visualize the current state of your cluster.
+These visualizations can provide insight into why we've chosen to use the queue:compute ratio instead of GPU utilization as
+the metric used to control the behavior of the horizontal pod autoscaler.
+
+| GPU Utilization                                                       | Queue-to-Compute Ratio                                                        |
+| --------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| ![Example GPU utilization graph](./images/graph_gpu-utilization.png)  | ![Example queue:compute ratio graph](./images/graph_queue-compute-ratio.png)  |
+
+The above graphs are over the same period of time.
+Comparing the two clearly shows that the ratio graph is a cleaner indication of when additional resources are necessary to
+meet current inference demands, whereas the GPU utilization graph contains too much noise to produce a clear signal for the
+horizontal pod autoscaler to operate on.
+
+
+## Developing this Guide
+
+During the development of this guide, I ran into several problems that needed to be solved before we could provide a useful
+guide.
+This section will outline and describe the issues I ran into and how we resolved them.
+
+> _This document was developed using a Kubernetes cluster provided by Amazon EKS._
+> _Clusters provisioned on-premises or provided by other cloud service providers such as Azure AKS or GCloud GKE might require_
+> _modifications to this guide._
+
+### Metrics Configuration is as Much an Art as a Science
+
+During the development of this guide I spent an inordinate amount of time figuring out every variable, setting, and
+configuration required to get all of the necessary and useful metrics.
+Much of the effort was spent on discovering the intricacies of Kubernetes' HPA controller and how it consumed metrics.
+
+Initially, I was unable to get the HPA controller to recognize the custom metrics I wanted to use to control pod autoscaling.
+Ultimately I discovered that the v2 HPA controller had been automatically configured when the
+[Prometheus Stack for Kubernetes](#prometheus-stack-for-kubernetes) was installed to use the `custom.metrics.k8s.io/v1beta1`
+endpoint provided by Prometheus.
+
+Run the following command to retrieve the set of metrics provided by the `custom.metrics.k8s.io/v1beta1` endpoint.
+
+```bash
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1
+```
+
+The above will return a JSON blob which can be inspected in your favorite IDE.
+I recommend VSCode because it handles JavaScript and JSON very well, but use the tool that suits you best.
+
+Current metric values can be queried from the endpoint using a command like the one below.
+
+```bash
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/pods/*/triton:queue_compute:ratio
+```
+
+The above command requests the `triton:queue_compute:ratio` metric for all pods in the `default` namespace.
+This is almost exactly what the Kubernetes v2 HPA controller will do to query the metrics it needs to make autoscaling decisions.
+Once I knew this, I was able to experiment with configuration values in the Prometheus and Prometheus Adapter installations as
+well as in the Prometheus Rule we created in the above guide until everything "just worked".
+
+
+### Why This Set of Software Components?
+
+The set of software packages described in this document is close to the minimum viable set of packages without handcrafting
+custom Helm charts and YAML files for every package and dependency.
+Is this the only set of packages and components that can be used to make this solution work?
+Definitely not; there are several alternatives which could meet our requirements.
+This set of packages and components is just the set I happened to choose for this guide.
+
+Below is a high-level description of why each package is listed in this guide.
+
+#### NVIDIA Device Plugin for Kubernetes
+
+Required to enable GPUs to be treated as resources by the Kubernetes scheduler.
+Without this component, GPUs would not be assigned to containers correctly.
+
+#### NVIDIA GPU Discovery Service for Kubernetes
+
+Provides automatic labelling of Kubernetes nodes based on the NVIDIA devices and software available on the node.
+Without the provided labels, it would not be possible to specify specific GPU SKUs when deploying models because the
+Kubernetes scheduler treats all GPUs as identical (referring to them all with the generic resource name `nvidia.com/gpu`).
+
+#### Kubernetes Node Discovery Service
+
+This is a requirement for the [NVIDIA GPU Discovery Service for Kubernetes](#nvidia-gpu-discovery-service-for-kubernetes).
+
+#### NVIDIA DCGM Exporter
+
+Provides hardware monitoring and metrics for NVIDIA GPUs and other devices present in the cluster.
+Without the metrics this provides, monitoring GPU utilization, temperature and other metrics would not be possible.
+
+While Triton Server has the capability to collect and serve NVIDIA hardware metrics, relying on Triton Server to provide this
+service is non-optimal for several reasons.
+
+Firstly, having many processes on the same machine query the NVIDIA device driver for current state, filter the results for
+only the values that pertain to the individual process, and serve them via Triton's open-metrics server duplicates work for
+every Triton Server process beyond the first on the node.
+
+Secondly, due to the need to interface with the kernel-mode driver to retrieve hardware metrics, queries get serialized, adding
+additional overhead and latency to the system.
+
+Finally, the rate at which metrics are collected from Triton Server is not the same as the rate at which metrics are collected
+from the DCGM Exporter.
+Separating the metrics collection from Triton Server allows for customized metric collection rates, which enables us to
+further minimize the process overhead placed on the node.
+
+##### Why is the DCGM Exporter Values File Custom?
+
+I decided to use a custom values file when installing the DCGM Exporter Helm chart for several reasons.
+
+Firstly, it is my professional opinion that every container in a cluster should specify resource limits and requests.
+Not doing so opens the node up to a number of difficult-to-diagnose failure conditions related to resource exhaustion.
+Out of memory errors are the most obvious and easiest to root cause.
+Additionally, difficult-to-reproduce, transient timeout and timing errors caused by CPU over-subscription can easily happen when
+any container is unconstrained and quickly waste an entire engineering team's time as they attempt to triage, debug, and
+resolve them.
+
+Secondly, the DCGM Exporter process itself spams error logs when it cannot find NVIDIA devices in the system.
+This is primarily because the service was originally created for non-Kubernetes environments.
+Therefore I wanted to restrict which nodes the exporter would get deployed to.
+Fortunately, the DCGM Helm chart makes this easy by supporting node selector options.
+
+Thirdly, because nodes with NVIDIA GPUs have been tainted with the `nvidia.com/gpu=present:NoSchedule` taint that prevents any
+pod which does not explicitly tolerate the taint from being assigned to the node, I needed to add the tolerations to the DCGM
+Exporter pod.
+
+Finally, the default Helm chart for DCGM Exporter is missing the required `--kubernetes=true` option being passed in via
+command line options when the process is started.
+Without this option, DCGM Exporter does not correctly associate hardware metrics with the pods actually using it, and
+there would be no mechanism for understanding how each pod uses the GPU resources assigned to it.
+
+#### Prometheus Stack for Kubernetes
+
+Provides metrics collection and aggregation services for the cluster.
+While there are other tools capable of providing similar services, we found the Prometheus Stack for Kubernetes was the
+easiest to install and configure.
+Additionally, the automatic inclusion of a Grafana based user interface made visualization of the cluster's current health
+easier to set up.
+
+Our initial work on this document was based on another metrics service, but we found the configuration of metrics collection
+from Triton Server and the use of custom metrics to drive horizontal pod autoscaling overly difficult and confusing.
+
+#### Prometheus Adapter for Kubernetes
+
+Provides metrics collection from non-standard metrics providers, like Triton Server, which is a requirement when leveraging
+custom metrics as described in this document.
+
+##### Why the Custom Values File for Prometheus Adapter?
+
+I created a custom values file for Prometheus Adapter for very similar reasons to why I created a custom values file for DCGM
+Exporter.
+Namely, taints and tolerations, optimized values for metrics collection, and the necessity of providing the correct URL to the
+deployed Prometheus server.
+
+#### Why Use the Triton CLI and Not Other Tools Provided by NVIDIA?
+
+I chose to use the new [Triton CLI](https://github.com/triton-inference-server/triton_cli) tool to optimize models for
+TensorRT-LLM instead of other available tools for a couple of reasons.
+
+Firstly, using the Triton CLI simplifies the conversion and optimization of models into a single command.
+
+Secondly, relying on the Triton CLI simplifies the creation of the container because all requirements were met with a single
+`pip install` command.
+
+##### Why Use a Custom Branch of Triton CLI Instead of an Official Release?
+
+I decided to use a custom [branch of Triton CLI](https://github.com/triton-inference-server/triton_cli/tree/jwyman/aslb-mn)
+because there are features this guide needed that were not present in any of the official releases available.
+The branch is not a Merge Request because the method used to add the needed features does not align with changes the
+maintainers have planned.
+Once we can achieve alignment, this guide will be updated to use an official release.
+
+
+### Why Does the Chart Run a Python Script Instead of Triton Server Directly?
+
+There are two reasons:
+
+1. In order to retrieve a model from Hugging Face, convert and optimize it for TensorRT-LLM, and cache it on the host, I decided
+   that a [pod initialization container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) was the most
+   straightforward solution.
+
+   In order to make the best use of the initialization container, I chose to use a custom [server.py](./containers/server.py)
+   script that makes use of the new [Triton CLI](https://github.com/triton-inference-server/triton_cli) tool.
+
+2. Multi-GPU deployments require a rather specialized command line, and generating it using Helm chart scripting was not
+   something I wanted to deal with.
+   Leveraging the custom Python script was the logical, and easiest, solution.
+
+#### Why is the Python Written Like That?
+
+Because I'm not a Python developer, but I am learning!
+My background is in C/C++ with plenty of experience with shell scripting languages.
+
+
+### Why Use a Custom Triton Image?
+
+I decided to use a custom image for a few reasons.
+
+1. Given the answer above and the use of the Triton CLI and a custom Python script, the initialization container needed both
+   components pre-installed in it to avoid unnecessary use of ephemeral storage.
+
+   _Use of ephemeral storage can lead to pod eviction, and therefore should be avoided whenever possible._
+
+2. Since the Triton + TRT-LLM image is already incredibly large, I wanted to avoid consuming additional host storage space
+   with yet another container image.
+
+   Additionally, the experience of a pod appearing to be stuck in the `Pending` state while it downloads a container prior to
+   the initialization container is easier to understand compared to a short `Pending` state before the initialization
+   container, followed by a much longer `Pending` state before the Triton Server can start.
+
+3. I wanted a custom, constant environment variable set for `ENGINE_DEST_PATH` that could be used by both the initialization
+   and Triton Server containers.
+
+
+### What is the `client/` Folder For?
+
+I decided to include the tools I used to validate this guide, and the deployment definitions in the `client/` folder are a key
+piece of that effort.
+You can use them yourself, if you want to.
+All that is required is to run (example for `llama-3-8b`) `kubectl apply -f ./clients/llama-3-8b.yaml` to create the
+deployment followed by `kubectl scale deployment/llama-3-8b --replicas=`.
+
+As you increase the number of clients generating inference requests for a given Triton Server deployment, load will increase
+on the server and the queue-to-compute ratio will eventually cause the horizontal pod autoscaler to increase the number of
+Triton Server instances handling requests until the desired ratio is achieved.
+
+Decreasing the number of clients will have the inverse effect and reduce the number of Triton Server instances deployed.
+
+Note that it is important to use the `containers/client.containerfile` to build a client container image before attempting to
+create a client deployment in your cluster.
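+
+For example, a build-and-push sequence similar to the one used for the server image might look like the following; the tag and
+repository shown here are illustrative only and should be replaced with your own.
+
+```bash
+docker build \
+  --file ./containers/client.containerfile \
+  --rm \
+  --tag triton_trt-llm_client:24.04 \
+  .
+
+docker tag triton_trt-llm_client:24.04 nvcr.io/example/triton_trt-llm_client:24.04
+docker push nvcr.io/example/triton_trt-llm_client:24.04
+```
+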
+Just like when building the `containers/server.containerfile`, the image will need to be hosted somewhere the cluster's machines
+are able to download it from.
+
+
+### Why Doesn't this Guide Include Load Balancer Instructions?
+
+Experiments with specialized load balancers, that can utilize pod metrics to determine which instance of Triton Server is the
+best instance to send new work to, showed modest-at-best improvements over the "round robin" system provided by the
+Kubernetes networking layer via [kube-proxy](https://kubernetes.io/docs/reference/command-line-tools-reference/kube-proxy/).
+Since kube-proxy is required for every network operation in a cluster anyway, leveraging the existing solution was the more
+optimal choice because it avoided adding even more complexity without justifiable value.
+
+Results in your environment could very well be different.
+I encourage you to experiment with specialized load balancers to determine the best solution for your workloads.
+
+---
+
+Software versions featured in this document:
+
+* Triton Inference Server v2.45.0 (24.04-trtllm-python-py3)
+* TensorRT-LLM v0.9.0
+* Triton CLI v0.0.7
+* NVIDIA Device Plugin for Kubernetes v0.15.0
+* NVIDIA GPU Discovery Service for Kubernetes v0.8.2
+* NVIDIA DCGM Exporter v3.3.5
+* Kubernetes Node Discovery Service v0.15.4
+* Prometheus Stack for Kubernetes v58.7.2
+* Prometheus Adapter for Kubernetes v4.10.0
+
+---
+
+Author: J Wyman, System Software Architect, AI & Distributed Systems
+
+Copyright © 2024, NVIDIA CORPORATION. All rights reserved.
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml
new file mode 100644
index 00000000..af68a7be
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v2
+appVersion: 0.1.0
+description: Triton + TensorRT-LLM autoscaling and load balancing example.
+icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png
+name: triton_trt-llm_aslb-example
+version: 0.1.0
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
new file mode 100644
index 00000000..f303717b
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +gpu: +- Tesla-T4 +- Tesla-V100-SXM2-16GB + +model: + name: gpt2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml new file mode 100644 index 00000000..5dbb8f91 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: +- Tesla-V100-SXM2-16GB + +model: + name: llama-2-7b-chat + pullSecret: hf-model-pull + tensorrtLlm: + parallelism: + tensor: 2 + +autoscaling: + metric: + value: 1500m diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml new file mode 100644 index 00000000..9679275b --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: +- NVIDIA-A10G +- NVIDIA-A100-SXM4-40GB + +model: + name: llama-2-7b + pullSecret: hf-model-pull diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml new file mode 100644 index 00000000..71518751 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: +- NVIDIA-A100-SXM4-40GB + +model: + name: llama-3-70b-instruct + pullSecret: hf-model-pull + tensorrtLlm: + parallelism: + tensor: 8 + +autoscaling: + metric: + value: 3500m diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml new file mode 100644 index 00000000..439fe135 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: +- Tesla-V100-SXM2-16GB + +model: + name: llama-3-8b-instruct + pullSecret: hf-model-pull + tensorrtLlm: + parallelism: + tensor: 2 + +autoscaling: + metric: + value: 1500m diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml new file mode 100644 index 00000000..1151b32a --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: +- NVIDIA-A10G +- NVIDIA-A100-SXM4-40GB + +model: + name: llama-3-8b + pullSecret: hf-model-pull diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml new file mode 100644 index 00000000..8d368149 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: +- Tesla-V100-SXM2-16GB +- Tesla-T4 + +model: + name: opt125m + pullSecret: hf-model-pull diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt new file mode 100644 index 00000000..a2ee0a72 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt @@ -0,0 +1,12 @@ +{{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. + +Release Name: {{ $.Release.Name }} +Namespace: {{ $.Release.Namespace }} +Deployment Name: {{ $.Release.Name }} +Service Name: {{ $.Release.Name }} + +Helpful commands: + + $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} + $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} + $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments,pods,hpa,services,podmonitors diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml new file mode 100644 index 00000000..c6515a40 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml @@ -0,0 +1,324 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $hostRootPath := "/triton" }} +{{- $image_name := "" }} +{{- with $.Values.triton }} + {{- with .image }} + {{- $image_name = required "Property '.triton.image.name' is required." .name }} + {{- else }} + {{- fail "Property '.triton.image' is required." }} + {{- end }} +{{- else }} + {{- fail "Property '.triton' is required" }} +{{- end }} +{{- $model_name := "" }} +{{- $model_dt := "float16" }} +{{- $model_pp := 1 }} +{{- $model_tp := 1 }} +{{- $model_trtllm := true }} +{{- with $.Values.kubernetes }} + {{- with .hostRootPath }} + {{- $hostRootPath = . }} + {{- end }} +{{- end }} +{{- with $.Values.model }} + {{- $model_name = required "Property '.model.name' is required." .name }} + {{- with .tensorrtLlm }} + {{- $model_trtllm = .enable }} + {{- with .dataType }} + {{- $model_dt = . }} + {{- end }} + {{- with .parallelism }} + {{- with .pipeline }} + {{- $model_pp = (int .) 
}} + {{- end }} + {{- with .tensor }} + {{- $model_tp = (int .) }} + {{- end }} + {{- end }} + {{- end }} +{{- else }} + {{- fail "Property '.model' is required." }} +{{- end }} +{{- $model_lower := lower $model_name }} +{{- $model_upper := upper $model_name }} +{{- $model_gpus := mul $model_pp $model_tp }} +{{- $triton_cpu := 4 }} +{{- if lt $triton_cpu $model_gpus }} + {{- $triton_cpu = $model_gpus }} +{{- end }} +{{- $triton_memory := printf "%dGi" (mul $triton_cpu 8) }} +{{- with $.Values.triton }} + {{- with .image }} + {{- with .name }} + {{- $image_name = . }} + {{- end }} + {{- end }} + {{- with .resources }} + {{- with .cpu }} + {{- $triton_cpu = (int .) }} + {{- end }} + {{- with .memory }} + {{- $triton_memory = . }} + {{- end }} + {{- end }} +{{- end }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} +{{- with $.Values.kubernetes }} + {{- with .labels }} +{{ toYaml . | indent 4 }} + {{- end }} +{{- end }} +spec: + selector: + matchLabels: + app: {{ $.Release.Name }} + replicas: 1 + template: + metadata: + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: server +{{- with $.Values.kubernetes }} + {{- with .labels }} +{{ toYaml . | indent 8 }} + {{- end }} +{{- end }} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists +{{- with $.Values.gpu }} + - key: nvidia.com/gpu.product + operator: In + values: +{{ toYaml . | indent 16 }} +{{- else }} + {{- fail "Property '.gpu' is required." }} +{{- end }} + containers: + - name: triton + command: + - python3 + - ./server.py + - exec +{{- if $model_trtllm }} + - --engine=trtllm + - --dt={{ $model_dt }} + - --pp={{ $model_pp }} + - --tp={{ $model_tp }} +{{- else }} + - --engine=vllm +{{- end }} +{{- with $.Values.logging }} + {{- with .tritonServer }} + {{- if .useIso8601 }} + - --iso8601 + {{- end }} + {{- if .verbose }} + - --verbose + {{- end }} + {{- end }} +{{- end }} + env: + - name: ENGINE_DEST_PATH + value: /var/run/engines + - name: HF_HOME + value: /var/run/cache +{{- with $.Values.logging }} + {{- with .tritonServer }} + {{- if .verbose }} + - name: NCCL_DEBUG + value: INFO + {{- end }} + {{- end }} +{{- end }} + image: {{ $image_name }} + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 15 + httpGet: + path: /v2/health/live + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 2 + successThreshold: 1 + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + readinessProbe: + failureThreshold: 15 + httpGet: + path: /v2/health/ready + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 2 + successThreshold: 1 + resources: + limits: + cpu: {{ $triton_cpu }} + ephemeral-storage: 1Gi + memory: {{ $triton_memory }} + nvidia.com/gpu: {{ $model_gpus }} + requests: + cpu: {{ $triton_cpu }} + ephemeral-storage: 1Gi + memory: {{ $triton_memory }} + nvidia.com/gpu: {{ $model_gpus }} + volumeMounts: +{{- if $model_trtllm }} + - mountPath: /var/run/engines + name: engine-repository + readOnly: false +{{- end }} + - mountPath: /var/run/models + name: model-repository + readOnly: true + - mountPath: /var/run/cache + name: transformers-cache + readOnly: false +{{- with $.Values.triton }} + {{- with .image }} + {{- with .pullSecrets }} + imagePullSecrets: +{{ toYaml . 
| indent 6 }} + {{- end }} + {{- end }} +{{- end }} + initContainers: + - name: init + command: + - python3 + - ./server.py + - init + - --model={{ $model_lower }} +{{- if $model_trtllm }} + - --engine=trtllm + - --dt={{ $model_dt }} + - --pp={{ $model_pp }} + - --tp={{ $model_tp }} +{{- else }} + - --engine=vllm +{{- end }} +{{- with $.Values.logging }} + {{- with .initialization }} + {{- if .verbose }} + - --verbose + {{- end }} + {{- end }} +{{- end }} + env: + - name: ENGINE_DEST_PATH + value: /var/run/engines + - name: HF_HOME + value: /var/run/cache + - name: HF_HUB_DISABLE_PROGRESS_BARS + value: "1" + - name: HF_HUB_DISABLE_TELEMETRY + value: "1" + - name: HF_HUB_VERBOSITY + value: info + - name: NO_COLOR + value: "1" + - name: TERM + value: none +{{- with $.Values.logging }} + {{- with .initialization }} + {{- if .verbose }} + - name: TRITON_CLI_VERBOSE + value: "1" + {{- end }} + {{- end }} +{{- end }} + image: {{ $image_name }} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: {{ $triton_cpu }} + ephemeral-storage: 96Gi + nvidia.com/gpu: {{ $model_gpus }} + requests: + cpu: {{ $triton_cpu }} + ephemeral-storage: 96Gi + nvidia.com/gpu: {{ $model_gpus }} + volumeMounts: +{{- if $model_trtllm }} + - mountPath: /var/run/engines + name: engine-repository + readOnly: false +{{- end }} + - mountPath: /var/run/models + name: model-repository + readOnly: false + - mountPath: /var/run/cache + name: transformers-cache + readOnly: false +{{- with $.Values.model }} + {{- if .pullSecret }} + - mountPath: /var/run/secrets/hugging_face + name: hf-secret + readOnly: true + {{- end }} +{{- end }} + restartPolicy: Always + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +{{- with $.Values.kubernetes }} + {{- with .tolerations }} +{{ toYaml . | indent 6 }} + {{- end }} +{{- end }} + volumes: +{{- if $model_trtllm }} + - name: engine-repository + hostPath: + path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }} + type: DirectoryOrCreate +{{- end }} + - name: model-repository + hostPath: +{{- if $model_trtllm }} + path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }} +{{- else }} + path: {{ printf "%s/models/%s/vllm" $hostRootPath $model_lower }} +{{- end }} + type: DirectoryOrCreate +{{- with $.Values.model }} + {{- with .pullSecret }} + - name: hf-secret + secret: + secretName: {{ . }} + {{- end }} +{{- end }} + - name: transformers-cache + hostPath: + path: {{ $hostRootPath }}/huggingface + type: DirectoryOrCreate diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/horizontal-pod-autoscaler.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/horizontal-pod-autoscaler.yaml new file mode 100644 index 00000000..96910f4a --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/horizontal-pod-autoscaler.yaml @@ -0,0 +1,66 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $metric_name := "triton:queue_compute:ratio" }} +{{- $metric_value := "1000m" }} +{{- $replicasMax := 4 }} +{{- $replicasMin := 1 }} +{{- with $.Values.autoscaling }} + {{- if .enable }} + {{- with .replicas }} + {{- with .maximum }} + {{- $replicasMax = . }} + {{- end }} + {{- with .minimum }} + {{- $replicasMin = . }} + {{- end }} + {{- end }} + {{- with .metric }} + {{- with .name }} + {{- $metric_name = . }} + {{- end }} + {{- with .value }} + {{- $metric_value = . }} + {{- end }} + {{- end }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: autoscaler + release: prometheus +{{- with $.Values.kubernetes }} + {{- with .labels }} +{{ toYaml . | indent 4 }} + {{- end }} +{{- end }} +spec: + maxReplicas: {{ $replicasMax }} + minReplicas: {{ $replicasMin }} + metrics: + - type: Pods + pods: + metric: + name: {{ $metric_name }} + target: + type: AverageValue + averageValue: {{ $metric_value }} + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ $.Release.Name }} + {{- end }} +{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/pod-monitor.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/pod-monitor.yaml new file mode 100644 index 00000000..ddd06d00 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/pod-monitor.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: autoscaler + release: prometheus +{{- with $.Values.kubernetes }} + {{- with .labels }} +{{ toYaml . | indent 4 }} + {{- end }} +{{- end }} +spec: + selector: + matchLabels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: server + podMetricsEndpoints: + - port: metrics + path: /metrics diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/service.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/service.yaml new file mode 100644 index 00000000..8a4783a2 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/service.yaml @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $noService := false }} +{{- with $.Values.kubernetes }} + {{- with .noService }} + {{- $noService = . }} + {{- end }} +{{- end }} +{{- if $noService }} +# Chart values optioned to not create a service. Service not created. +{{- else }} +apiVersion: v1 +kind: Service +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: service +{{- with $.Values.kubernetes }} + {{- with .labels }} +{{ toYaml . | indent 4 }} + {{- end }} +{{- end }} +spec: + ports: + - name: http + port: 8000 + targetPort: http + - name: grpc + port: 8001 + targetPort: grpc + - name: metrics + port: 8002 + targetPort: metrics + selector: + app: {{ $.Release.Name }} + type: ClusterIP +{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json new file mode 100644 index 00000000..bb911ca7 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json @@ -0,0 +1,347 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema#", + "copyright": [ + "# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.", + "# NVIDIA CORPORATION and its licensors retain all intellectual property", + "# and proprietary rights in and to this software, related documentation", + "# and any modifications thereto. Any use, reproduction, disclosure or", + "# distribution of this software and related documentation without an express", + "# license agreement from NVIDIA CORPORATION is strictly prohibited." 
+ ], + "properties": { + "gpu": { + "description": "List of the GPUs support `.model` and to which Triton Server instances can be deployed.", + "items": [ + { + "description": "Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label.", + "type": "string" + } + ], + "type": "array" + }, + "model": { + "description": "Configuration options related to the AI model to be deployed.", + "properties": { + "name": { + "description": "Name of the model to be served Triton Server instances.", + "pattern": "(gpt2|opt125m|llama-(2-(7b|70b)(-chat)?|3-(8b|70b)(-instruct)?))", + "type": "string" + }, + "pullSecret": { + "description": "Name of the secret used to download the model from Hugging Face.", + "oneOf": [ + { + "type": "string" + }, + { "type": "null" } + ] + }, + "tensorrtLlm": { + "description": "Configuration options related to the conversion of a non-optimized model into TensorRT format.", + "oneOf": [ + { + "properties": { + "dataType": { + "description": "Data type used when compiling and optimizing the model for TensorRT.", + "oneOf": [ + { + "pattern": "(bfloat16|float16|float32)", + "type": "string" + }, + { "type": "null" } + ] + }, + "enable": { + "description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.", + "oneOf": [ + { + "type": "boolean" + }, + { "type": "null" } + ] + }, + "parallelism": { + "description": "Parallelism configuration options which affect how the model is converted to TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs.", + "oneOf": [ + { + "properties": { + "pipeline": { + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + }, + "tensor": { + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "required": [ "name" ], + "type": "object" + }, + "triton": { + "description": "Configuration options for Triton Server.", + "properties": { + "image": { + "description": "Configuration options related to the container image for Triton Server.", + "properties": { + "pullSecrets": { + "description": "Optional list of pull secrets to be used when downloading the Triton Server container image.", + "oneOf": [ + { + "items": [ + { + "type": "object" + } + ], + "type": "array" + }, + { "type": "null" } + ] + }, + "name": { + "description": "Name of the container image containing the version of Triton Server to be used.", + "type": "string" + } + }, + "required": [ "name" ], + "type": "object" + }, + "resources": { + "description": "Configuration options managing the resources assigned to individual Triton Server instances. 
", + "oneOf": [ + { + "properties": { + "cpu": { + "description": "Number of logical CPU cores reserved for, and assigned to each instance of Triton Server.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "pattern": "^\\d+m$", + "type": "string" + }, + { "type": "null" } + ] + }, + "memory": { + "description": "Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server.", + "oneOf": [ + { + "pattern": "^\\d+[GKMgkm]i$", + "type": "string" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "required": [ "image" ], + "type": "object" + }, + "autoscaling": { + "description": "Configuration options for automatic scaling of Triton Server deployments.", + "oneOf": [ + { + "properties": { + "enable": { + "description": "Determines if autoscaling is enabled for deployment or not.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "metric": { + "description": "Metric used to determine autoscaling decisions.", + "oneOf": [ + { + "properties": { + "name": { + "description": "Name of the metric monitored.", + "oneOf": [ + { "type": "string" }, + { "type": "null" } + ] + }, + "value": { + "description": "Threshold or targeted value used to determine the number of replicas concurrently deployed." + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "replicas": { + "description": "Controls the number of Triton Server replicas are deployed.", + "oneOf": [ + { + "properties": { + "maximum": { + "description": "Upper bound of the number of Triton Server replicas deployed concurrently.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + }, + "minimum": { + "description": "Lower bound of the number of Triton Server replicas deployed concurrently.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "logging": { + "description": "Configuration options related to how various components generate logs.", + "oneOf": [ + { + "properties": { + "initialization": { + "description": "Logging configuration options specific to the initialization container.", + "oneOf": [ + { + "properties": { + "verbose": { + "description": "When `true` the model download and generation of TRT engine and plan use verbose logging; otherwise standard logging is used.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "tritonServer": { + "description": "Logging configuration options specific to Triton Server.", + "oneOf": [ + { + "properties": { + "useIso8601": { + "description": "When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. 
", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "verbose": { + "description": "When `true` Triton Server uses verbose logging; otherwise standard logging is used.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "kubernetes": { + "description": "Configurations option related to the Kubernetes objects created by the chart.", + "oneOf": [ + { + "properties": { + "hostRootPath": { + "description": "Root file-system path used when mounting content to the underlying host.", + "oneOf": [ + { "type": "string" }, + { "type": "null" } + ] + }, + "labels": { + "description": "Optional set of labels to be applied to created Kubernetes objects.", + "oneOf": [ + { "type": "object" }, + { "type": "null" } + ] + }, + "noService": { + "description": "When `false`, a service will not be created when the chart is installed; otherwise a service will be created.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "tolerations": { + "description": "Tolerations applied to every pod deployed as part of this deployment.", + "oneOf": [ + { + "items": [ + { + "description": "Toleration applied to every pod deployed as part of this deployment.", + "type": "object" + }, + { "type": "null" } + ], + "type": "array" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "required": [ + "gpu", + "model", + "triton" + ] +} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml new file mode 100644 index 00000000..114c10f4 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml @@ -0,0 +1,127 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# List of the GPUs support `.model` and to which Triton Server instances can be deployed. +# Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label. +# Run 'kubectl get nodes' to find node names. +# Run 'kubectl describe node ' to inspect a node's labels. +gpu: # (required) +# - NVIDIA-A100-SXM4-40GB +# - NVIDIA-A10G +# - Tesla-V100-SXM2-16GB +# - Tesla-T4 + +# Configuration options related to the AI model to be deployed. +model: # (required) + # Name of the model to be served Triton Server instances. + # Supported values are: + # - gpt2 + # - llama-2-7b + # - llama-2-70b + # - llama-2-7b-chat + # - llama-2-70b-chat + # - llama-3-8b + # - llama-3-70b + # - llama-3-8b-instruct + # - llama-3-70b-instruct + # - opt125m + name: # (required) + # Configuration options related to the conversion of a non-optimized model into TensorRT format. + tensorrtLlm: # (optional) + # When `true`, enables conversion of models into TensorRT format before loading them into Triton Server. 
+ # When 'false', the init container will fall back to vLLM and parallelism options are ignored. + enable: # (default: true) + # Data type used when compiling and optimizing the model for TensorRT. + # Supported options are float16, bfloat16, float32 + dataType: # (default: float16) + # Parallelism configuration options which affect how the model is converted to + # TensorRT-LLM format, specifically if/how the model is partitioned for deployment to + # multiple GPUs. + parallelism: # (optional) + # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a + # subset of layers that is executed on a separate device. + # The main limitation of this method is that, due to the sequential nature of the processing, some devices or + # layers may remain idle while waiting for the output. + pipeline: # (default: 1) + # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, + # independent blocks of computation that can be executed on different devices. + # Attention blocks and multi-layer perceptron (MLP) layers are major components of transformers that can take advantage of + # tensor parallelism. + # In multi-head attention blocks, each head or group of heads can be assigned to a different device so they can be computed + # independently and in parallel. + tensor: # (default: 1) + # Name of the secret used to download the model from Hugging Face. + # GPT2 does not require an access token to download. + # Other models may require per repository permissions to be granted. + pullSecret: # (optional) + +# Configuration options for Triton Server. +triton: # (required) + # Configuration options related to the container image for Triton Server. + image: # (required) + # Optional list of pull secrets to be used when downloading the Triton Server container image. + pullSecrets: # (optional) + # - name: ngc-container-pull + # Name of the container image containing the version of Triton Server to be used. + name: # (required) + # Configuration options managing the resources assigned to individual Triton Server instances. + resources: # (optional) + # Number of logical CPU cores reserved for, and assigned to each instance of Triton Server. + cpu: # (default: 4) + # Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server. + memory: # (default: 32Gi) + +# Configuration options for automatic scaling of Triton Server deployments. +autoscaling: # (optional) + # Determines if autoscaling is enabled for deployment or not. + enable: true # (default: true) + # Controls the number of Triton Server replicas are deployed. + replicas: # (optional) + # Upper bound of the number of Triton Server replicas deployed concurrently. + maximum: # (default: 4) + # Lower bound of the number of Triton Server replicas deployed concurrently. + minimum: # (default: 1) + # Metric used to determine autoscaling decisions. + metric: # (optional) + # Name of the metric monitored. + name: # (default: triton:queue_compute:ratio) + # Threshold or targeted value used to determine the number of replicas concurrently deployed. + value: # (default: 1) + +# Configuration options related to how various components generate logs. +logging: # (optional) + # Logging configuration options specific to the initialization container. + initialization: + # When `true` the model download and generation of TRT engine and plan use verbose logging; otherwise standard logging is used. 
+ verbose: # (default: false) + # Logging configuration options specific to Triton Server. + tritonServer: + # When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. + useIso8601: # (default: false) + # When `true` Triton Server uses verbose logging; otherwise standard logging is used. + verbose: # (default: false) + +# Configurations option related to the Kubernetes objects created by the chart. +kubernetes: # (optional) + # Root file-system path used when mounting content to the underlying host. + hostRootPath: # (default: /triton) + # Optional set of labels to be applied to created Kubernetes objects. + # These labels can be used for association with a preexisting service object. + labels: # (optional) + # customLabel: value + # When `false`, a service will not be created when the chart is installed; otherwise a service will be created. + noService: # (default: false) + # Tolerations applied to every pod deployed as part of this deployment. + # Template already includes `nvidia.com/gpu=present:NoSchedule`. + tolerations: # (optional) diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/README.md b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/README.md new file mode 100644 index 00000000..f7e29c92 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/README.md @@ -0,0 +1,6 @@ +# Client Inference Generators + +The files in this folder are for the deployment of client pods in the same cluster as a model hosted by Triton + TRT-LLM using +the provided sample Helm chart. +Each file creates a single deployment of a client container which can be used to generate inference requests for the deployed +model. diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/gpt2.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/gpt2.yaml new file mode 100644 index 00000000..f5731148 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/gpt2.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
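+
+# Example usage (a sketch; it assumes the sample Helm chart was installed with the release name
+# `gpt2`, so that the in-cluster service name matches the TRTLLM_TRITON_URL value below, and
+# that this file is applied from the clients/ directory):
+#
+#   kubectl apply -f ./gpt2.yaml
+#   kubectl scale deployment/client-gpt2 --replicas=8   # add load to exercise autoscaling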
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: client-gpt2 +spec: + selector: + matchLabels: + app: client-gpt2 + replicas: 1 + template: + metadata: + labels: + app: client-gpt2 + app.kubernetes.io/component: client + spec: + containers: + - name: client + command: + - python3 + - ./client.py + env: + - name: TRTLLM_MODEL_NAME + value: gpt2 + - name: TRTLLM_TRITON_URL + value: gpt2 + - name: TRTLLM_MAX_TOKENS + value: "256" + # - name: TRTLLM_DEBUG + # value: debug + image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 1000m + ephemeral-storage: 1Gi + memory: 1Gi + requests: + cpu: 500m + ephemeral-storage: 1Gi + memory: 1Gi + imagePullSecrets: + - name: ngc-container-pull + restartPolicy: Always diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-70b-instruct.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-70b-instruct.yaml new file mode 100644 index 00000000..bd17e174 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-70b-instruct.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: client-llama-2-70b-instruct +spec: + selector: + matchLabels: + app: client-llama-2-70b-instruct + replicas: 1 + template: + metadata: + labels: + app: client-llama-2-70b-instruct + app.kubernetes.io/component: client + spec: + containers: + - name: client + command: + - python3 + - ./client.py + env: + - name: TRTLLM_MODEL_NAME + value: llama-2-70b-instruct + - name: TRTLLM_TRITON_URL + value: llama-2-70b-instruct + # - name: TRTLLM_MAX_TOKENS + # value: "512" + # - name: TRTLLM_DEBUG + # value: debug + image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 1000m + ephemeral-storage: 1Gi + memory: 2Gi + requests: + cpu: 750m + ephemeral-storage: 1Gi + memory: 1536Mi + imagePullSecrets: + - name: ngc-container-pull + restartPolicy: Always diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-7b.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-7b.yaml new file mode 100644 index 00000000..fda18528 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-7b.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
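+
+# The TRTLLM_MAX_TOKENS and TRTLLM_DEBUG environment variables below are left commented out;
+# when unset, client.py falls back to its defaults (512 max tokens, debug output disabled).
+# Uncomment and adjust them to change the behavior of this load-generating client.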
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: client-llama-2-7b +spec: + selector: + matchLabels: + app: client-llama-2-7b + replicas: 1 + template: + metadata: + labels: + app: client-llama-2-7b + app.kubernetes.io/component: client + spec: + containers: + - name: client + command: + - python3 + - ./client.py + env: + - name: TRTLLM_MODEL_NAME + value: llama-2-7b + - name: TRTLLM_TRITON_URL + value: llama-2-7b + # - name: TRTLLM_MAX_TOKENS + # value: "512" + # - name: TRTLLM_DEBUG + # value: debug + image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 1000m + ephemeral-storage: 1Gi + memory: 2Gi + requests: + cpu: 750m + ephemeral-storage: 1Gi + memory: 1536Mi + imagePullSecrets: + - name: ngc-container-pull + restartPolicy: Always diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b-instruct.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b-instruct.yaml new file mode 100644 index 00000000..7cbeaf5d --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b-instruct.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: client-llama-3-8b-instruct +spec: + selector: + matchLabels: + app: client-llama-3-8b-instruct + replicas: 1 + template: + metadata: + labels: + app: client-llama-3-8b-instruct + app.kubernetes.io/component: client + spec: + containers: + - name: client + command: + - python3 + - ./client.py + env: + - name: TRTLLM_MODEL_NAME + value: llama-3-8b-instruct + - name: TRTLLM_TRITON_URL + value: llama-3-8b-instruct + # - name: TRTLLM_MAX_TOKENS + # value: "512" + # - name: TRTLLM_DEBUG + # value: debug + image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 1000m + ephemeral-storage: 1Gi + memory: 2Gi + requests: + cpu: 750m + ephemeral-storage: 1Gi + memory: 1536Mi + imagePullSecrets: + - name: ngc-container-pull + restartPolicy: Always diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b.yaml new file mode 100644 index 00000000..d83fbaf2 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b.yaml @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: client-llama-3-8b +spec: + selector: + matchLabels: + app: client-llama-3-8b + replicas: 1 + template: + metadata: + labels: + app: client-llama-3-8b + app.kubernetes.io/component: client + spec: + containers: + - name: client + command: + - python3 + - ./client.py + env: + - name: TRTLLM_MODEL_NAME + value: llama-3-8b + - name: TRTLLM_TRITON_URL + value: llama-3-8b + image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 1000m + ephemeral-storage: 1Gi + memory: 2Gi + requests: + cpu: 750m + ephemeral-storage: 1Gi + memory: 1536Mi + imagePullSecrets: + - name: ngc-container-pull + restartPolicy: Always diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/opt125m.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/opt125m.yaml new file mode 100644 index 00000000..3c3f2f0f --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/opt125m.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: client-opt125m +spec: + selector: + matchLabels: + app: client-opt125m + replicas: 1 + template: + metadata: + labels: + app: client-opt125m + app.kubernetes.io/component: client + spec: + containers: + - name: client + command: + - python3 + - ./client.py + env: + - name: TRTLLM_MODEL_NAME + value: opt125m + - name: TRTLLM_TRITON_URL + value: opt125m + # - name: TRTLLM_MAX_TOKENS + # value: "512" + # - name: TRTLLM_DEBUG + # value: debug + image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 1000m + ephemeral-storage: 1Gi + memory: 2Gi + requests: + cpu: 750m + ephemeral-storage: 1Gi + memory: 1536Mi + imagePullSecrets: + - name: ngc-container-pull + restartPolicy: Always diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/README.md b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/README.md new file mode 100644 index 00000000..ed170ac6 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/README.md @@ -0,0 +1,15 @@ +# Container Generation + +The files in this folder are intended to be used to create the Triton Server container image. + +Run the following command to create a Triton Server container image. + +```bash +docker build --file ./server.containerfile --tag . +``` + +Run the following command to create a client load generation container image. + +```bash +docker build --file ./client.containerfile --tag . +``` diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.containerfile b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.containerfile new file mode 100644 index 00000000..8030c89d --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.containerfile @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +ARG BASE_CONTAINER_IMAGE=ubuntu:jammy + +FROM ${BASE_CONTAINER_IMAGE} + +# Set a set of useful labels. +LABEL "base"="${BASE_CONTAINER_IMAGE}" +LABEL "role"="client" + +# Stop APT (Debian package manager) from complaining about interactivity. +ENV DEBIAN_FRONTEND=noninteractive +# Set additional environment values that make usage more pleasant. +ENV TERM=xterm-256color + +RUN apt update \ + && apt install --fix-missing --no-install-recommends --yes \ + ca-certificates \ + wget \ + apt-transport-https \ + software-properties-common \ + python3 \ + python3-pip \ + icu-devtools \ + curl \ + git \ + && apt autoremove --yes \ + && apt purge --yes \ + && rm -rf /var/lib/apt/lists/* + +COPY client.py . + +ENTRYPOINT [ "/bin/bash" ] diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.py b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.py new file mode 100644 index 00000000..ce57b84d --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.py @@ -0,0 +1,191 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
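+
+# Load-generation client used by the deployments in the clients/ folder. It loops forever over
+# a fixed list of prompts, POSTing each one with `curl` to
+#   {TRTLLM_TRITON_URL}:8000/v2/models/{TRTLLM_MODEL_NAME}/generate
+# and logging the prompt, the request latency, and the returned `text_output`. It sleeps 250ms
+# between requests and exits only after more than 30 requests have failed.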
+ +import json +import os +import re +import subprocess +import sys +import time + +ERROR_CODE_FATAL = 255 +EXIT_CODE_SUCCESS = 0 + +DEBUG_KEY = "TRTLLM_DEBUG" +MAX_TOKENS_KEY = "TRTLLM_MAX_TOKENS" +MODEL_NAME_KEY = "TRTLLM_MODEL_NAME" +TRITON_URL_KEY = "TRTLLM_TRITON_URL" + +MAX_TOKENS_DEFAULT = 512 + +is_debug = False + +debug_value = os.getenv(DEBUG_KEY) + +if debug_value is not None: + is_debug = ( + debug_value == "1" + or debug_value == "true" + or debug_value == "yes" + or debug_value == "debug" + ) + +model_name = os.getenv(MODEL_NAME_KEY) + +if model_name is None: + raise Exception(f"Required environment variable '{MODEL_NAME_KEY}' not provided.") + +if is_debug: + print(f'model_name: "{model_name}".', file=sys.stdout, flush=True) + +triton_url = os.getenv(TRITON_URL_KEY) + +if triton_url is None: + raise Exception(f"Required environment variable '{TRITON_URL_KEY}' not provided.") + +triton_url = f"{triton_url}:8000/v2/models/{model_name}/generate" + +if is_debug: + print(f'triton_url: "{triton_url}".', file=sys.stdout, flush=True) + +max_tokens = MAX_TOKENS_DEFAULT + +max_token_value = os.getenv(MAX_TOKENS_KEY) +if max_token_value is not None: + try: + max_tokens = int(max_token_value) + + except: + print( + f"error: Environment variable {MAX_TOKENS_KEY}={max_token_value} is not valid and will be ignored.", + file=sys.stderr, + flush=True, + ) + print(" ", file=sys.stderr, flush=True) + +if is_debug: + print(f"max_tokens: {max_tokens}.", file=sys.stdout, flush=True) + +prompts = [ + "What is the market capitalization of NVIDIA?", + "What is the largest company in the world?", + "Who was the first president of France?", + "How tall was Napoleon?", + "Which colors are in the German flag?", + "Does China have a national animal?", + "What time is it in London?", + "Write me a rap song using references to SpongeBob SquarePants.", + "Give me 10 date-night ideas for my partner and me, but include ideas that we can do in the house, outdoors, and within a 10-mile radius.", + "Write a short story about a unicorn and a postbox using only emojis.", + "Write me advice on career planning, including how I can make steps towards financial goals and getting a promotion.", + "Suggest 10 web extensions students can use to increase productivity.", + "Write a strategy for how I can stay motivated at work and maintain focus.", + "Create a bulleted list of organic supplements that boost metabolism.", + "Which airlines have the best customer experience for long-haul flights?", +] + +index = 0 +error_count = 0 + +# Do this forever, or at least until a SIGABRT, SIGINT, or SIGKILL terminates the process. +while True: + question = prompts[index] + + if is_debug: + print(f'question: "{question}".') + + # Create a JSON encoded inference payload. + payload = json.dumps({"text_input": question, "max_tokens": max_tokens}) + + if is_debug: + print(f'payload: "{payload}".') + + # Build up the subprocess args. + args = ["curl", "-X", "POST", "-s", triton_url, "-d", payload] + + if is_debug: + print(f"args: {args}") + + # Concat a human friendly command line and then log it. + command = "" + for arg in args: + command += arg + command += " " + + print(f"> {command}", file=sys.stdout, flush=True) + + index += 1 + index %= len(prompts) + + # Run the subprocess and catch any failures. 
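+    # The call is timed so a per-request latency can be reported, and the JSON body returned by
+    # Triton is parsed so the model's `text_output` can be printed separately from the raw
+    # curl output.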
+ try: + time_start = time.time() + + sp_ran = subprocess.run(args, capture_output=True, check=True) + + if sp_ran.returncode != 0: + print(sp_ran.stderr, file=sys.stderr, flush=True) + print(" ", file=sys.stderr, flush=True) + + raise Exception(f'Inference command failed: "{exception}".') + + time_end = time.time() + + print( + f" completed in {(time_end - time_start)} seconds.", + file=sys.stdout, + flush=True, + ) + print(" ", file=sys.stdout, flush=True) + + output = sp_ran.stdout + + if is_debug: + print(f'output: "{output}".', file=sys.stdout, flush=True) + + result = json.loads(output) + + if is_debug: + print(f'result: "{result}".', file=sys.stdout, flush=True) + + text_output = result["text_output"] + + if is_debug: + print(f'text_output: "{text_output}".', file=sys.stdout, flush=True) + + answers = re.split("(\s{2,}|\n)", text_output) + + print("Prompt:", file=sys.stdout, flush=True) + print(f" {question}", file=sys.stdout, flush=True) + print(" ", file=sys.stdout, flush=True) + + print("Response:", file=sys.stdout, flush=True) + + for answer in answers: + print(f" {answer.strip()}", file=sys.stdout, flush=True) + + print(" ", file=sys.stdout, flush=True) + + except Exception as exception: + error_count += 1 + + print(" ", file=sys.stderr, flush=True) + print(f"error: {exception}", file=sys.stderr, flush=True) + print( + f" Inference command has failed {error_count} time(s).", + file=sys.stderr, + flush=True, + ) + + if error_count > 30: + print(f"fatal: Quitting after 30 failures.", file=sys.stderr, flush=True) + exit(ERROR_CODE_FATAL) + + # 250ms delay between inference requests. + time.sleep(0.250) + print(" ", file=sys.stdout, flush=True) diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.containerfile b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.containerfile new file mode 100644 index 00000000..3d18d5f3 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.containerfile @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 +ARG ENGINE_DEST_PATH=/var/run/engines +ARG HF_HOME=/var/run/cache + +FROM ${BASE_CONTAINER_IMAGE} + +# Set a set of useful labels. +LABEL "base"="${BASE_CONTAINER_IMAGE}" +LABEL "role"="server" + +# Stop APT (Debian package manager) from complaining about interactivity. +ENV DEBIAN_FRONTEND=noninteractive +# Set additional environment values that make usage more pleasant. +ENV TERM=xterm-256color + +# Set Triton CLI environment variables which control where +# TRTLLM engine and model files are downloaded to; and where +# the path to the Huggingface cache. +ENV ENGINE_DEST_PATH ${ENGINE_DEST_PATH} +ENV HF_HOME ${HF_HOME} + +# Set the active working directory. 
+WORKDIR /workspace + +# Install a custom version of Triton CLI that support Tensor parallelism and +# the 70B version of Llama models. +RUN pip --verbose install \ + --no-cache-dir \ + --no-color \ + --no-input \ + git+https://github.com/triton-inference-server/triton_cli.git@jwyman/aslb-mn + +# Copy the server script. +COPY server.py . + +RUN apt list --installed \ + && pip list --version + +ENTRYPOINT [ "/bin/bash" ] diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py new file mode 100644 index 00000000..d27ed52c --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py @@ -0,0 +1,342 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import subprocess +import sys +import time + +# These values are expected to match the mount points in the Helm Chart. +# Any changes here must also be made there, and vice versa. +CACHE_DIRECTORY = "/var/run/cache" +HF_SECRET_PATH = "/var/run/secrets/hugging_face/password" +MODEL_DIRECTORY = "/var/run/models" + +ERROR_EXIT_DELAY = 15 +ERROR_CODE_FATAL = 255 +ERROR_CODE_USAGE = 253 +EXIT_SUCCESS = 0 + +# Environment variable keys. +CLI_VERBOSE_KEY = "TRITON_CLI_VERBOSE" +ENGINE_PATH_KEY = "ENGINE_DEST_PATH" +HUGGING_FACE_KEY = "HF_HOME" + +HUGGING_FACE_CLI = "huggingface-cli" + +# --- + + +def clean_directory(directory_path: str): + if os.path.exists(directory_path): + for root, dirs, files in os.walk(top=directory_path, topdown=False): + for name in files: + file_path = os.path.join(directory_path, name) + try: + os.remove(file_path) + + except Exception as exception: + write_error(f" Failed to remove {file_path}") + write_error(f" {exception}") + + for name in dirs: + dir_path = os.path.join(directory_path, name) + write_error(f" - Removing {dir_path}") + try: + os.rmdir(dir_path) + + except Exception as exception: + write_error(f" Failed to remove {dir_path}") + write_error(f" {exception}") + + +# --- + + +def die(exception: Exception=None): + if exception is not None: + write_error(f"fatal: {exception}") + + write_error(f" Waiting {ERROR_EXIT_DELAY} second before exiting.") + # Delay the process' termination to provide a small window for administrators to capture the logs before it exits and restarts. + time.sleep(ERROR_EXIT_DELAY) + + exit(ERROR_CODE_USAGE) + + +# --- + + +def hugging_face_authenticate(args): + # Validate that `HF_HOME` environment variable was set correctly. + if HUGGING_FACE_HOME is None or len(HUGGING_FACE_HOME) == 0: + raise Exception(f"Required environment variable '{HUGGING_FACE_KEY}' not set.") + + # When a Hugging Face secret has been mounted, we'll use that to authenticate with Hugging Face. 
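+    # The Helm chart mounts the secret named by `.model.pullSecret` under
+    # /var/run/secrets/hugging_face, so HF_SECRET_PATH resolves to that secret's `password`
+    # entry only when a pull secret was configured; otherwise this block is skipped and models
+    # are downloaded anonymously.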
+ if os.path.exists(HF_SECRET_PATH): + with open(HF_SECRET_PATH) as token_file: + write_output( + f"Hugging Face token file '{HF_SECRET_PATH}' detected, attempting to authenticate w/ Hugging Face." + ) + write_output(" ") + + hf_token = token_file.read() + + # Use Hugging Face's CLI to complete the authentication. + result = run_command([HUGGING_FACE_CLI, "login", "--token"], [hf_token]) + + if result != 0: + raise Exception(f"Hugging Face authentication failed. ({result})") + + write_output("Hugging Face authentication successful.") + write_output(" ") + + +# --- + + +def run_command(cmd_args: [], extra_args: [] = None): + command = " ".join(cmd_args) + + if extra_args is not None and len(extra_args) > 0: + command += "****" + cmd_args += extra_args + + write_output(f"> {command}") + write_output(" ") + + # Run triton_cli to build the TRT-LLM engine + plan. + return subprocess.call(cmd_args, stderr=sys.stderr, stdout=sys.stdout) + + +# --- + + +def write_output(message: str): + print(message, file=sys.stdout, flush=True) + + +# --- + + +def write_error(message: str): + print(message, file=sys.stderr, flush=True) + + +# --- +# Below this line are the primary functions. +# --- + + +def execute_triton(args): + world_size = args.tp * args.pp + + if world_size <= 0: + raise Exception( + "usage: Options --pp and --pp must both be equal to or greater than 1." + ) + + # Single GPU setups can start a tritonserver process directly. + if world_size == 1: + cmd_args = [ + "tritonserver", + "--allow-cpu-metrics=false", + "--allow-gpu-metrics=false", + "--allow-metrics=true", + "--metrics-interval-ms=1000", + f"--model-repository={MODEL_DIRECTORY}", + "--model-load-thread-count=2", + "--strict-readiness=true", + ] + + if args.verbose > 0: + cmd_args += ["--log-verbose=1"] + + if args.iso8601 > 0: + cmd_args += ["--log-format=ISO8601"] + + # Multi-GPU setups require a specialized command line which based on `mpirun`. + else: + cmd_args = ["mpirun", "--allow-run-as-root"] + + for i in range(world_size): + cmd_args += [ + "-n", + "1", + "tritonserver", + f"--model-repository={MODEL_DIRECTORY}", + "--disable-auto-complete-config", + ] + cmd_args += [ + f"--http-port={(8000 + i * 10)}", + f"--grpc-port={(8001 + i * 10)}", + "--model-load-thread-count=2", + ] + + if i == 0: + cmd_args += [ + "--allow-cpu-metrics=false", + "--allow-gpu-metrics=false", + "--allow-metrics=true", + "--metrics-interval-ms=1000", + f"--id=rank{i}", + ] + + if args.verbose > 0: + cmd_args += ["--log-verbose=1"] + + if args.iso8601 > 0: + cmd_args += ["--log-format=ISO8601"] + + else: + cmd_args += [ + "--allow-http=false", + "--allow-grpc=false", + "--allow-metrics=false", + ] + cmd_args += ["--log-info=false", "--log-warning=false"] + + cmd_args += [ + "--disable-auto-complete-config", + f"--backend-config=python,shm-region-prefix-name=rank{i}_", + ":", + ] + + result = run_command(cmd_args) + exit(result) + + +# --- + + +def initialize_model(args): + if args.model is None or len(args.model) == 0: + die("Model name must be provided.") + + hugging_face_authenticate(args) + + engine_path = os.path.join(ENGINE_DIRECTORY, args.model) + model_path = os.path.join(MODEL_DIRECTORY, args.model) + + # When the model and plan already exist, we can exit early, happily. + if os.path.exists(engine_path) and os.path.exists(model_path): + write_output( + f"TensorRT engine and plan detected for {args.model}. No work to do, exiting." 
+ ) + exit(EXIT_SUCCESS) + + write_output(f"Begin generation of TensorRT engine and plan for {args.model}.") + write_output(" ") + + # Build up a set of args for the subprocess call. + cmd_args = [ + "triton", + "import", + "--model", + args.model, + "--model-repository", + MODEL_DIRECTORY, + ] + + if args.engine == "vllm": + cmd_args += ["--backend", "vllm"] + + else: + cmd_args += ["--backend", "tensorrtllm"] + + if args.dt is not None and args.dt in ["bfloat", "float16", "float32"]: + cmd_args += ["--data-type", args.dt] + + if args.pp > 1: + cmd_args += ["--pipeline-parallelism", f"{args.pp}"] + + if args.tp > 1: + cmd_args += ["--tensor-parallelism", f"{args.tp}"] + + # When verbose, insert the verbose flag. + # It is important to note that the flag must immediately follow `triton` and cannot be in another ordering position. + # This limitation will likely be removed a future release of triton_cli. + if is_verbose: + cmd_args.insert(1, "--verbose") + + result = run_command(cmd_args) + exit(result) + + +# --- + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("mode", type=str, choices=["exec", "init"]) + parser.add_argument("--model", type=str, default=None) + parser.add_argument( + "--dt", + type=str, + default="float16", + choices=["bfloat16", "float16", "float32"], + help="Tensor type.", + ) + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism.") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism.") + parser.add_argument("--iso8601", action="count", default=0) + parser.add_argument("--verbose", action="count", default=0) + parser.add_argument( + "--engine", type=str, default="trtllm", choices=["trtllm", "vllm"] + ) + + return parser.parse_args() + + +# --- + +try: + ENGINE_DIRECTORY = os.getenv(ENGINE_PATH_KEY) + HUGGING_FACE_HOME = os.getenv(HUGGING_FACE_KEY) + + is_verbose = os.getenv(CLI_VERBOSE_KEY) is not None + + # Validate that `ENGINE_DIRECTORY` isn't empty. + if ENGINE_DIRECTORY is None or len(ENGINE_DIRECTORY) == 0: + raise Exception(f"Required environment variable '{ENGINE_PATH_KEY}' not set.") + + # Validate that `ENGINE_DIRECTORY` actually exists. + if not os.path.exists(ENGINE_DIRECTORY): + raise Exception(f"Engine directory '{ENGINE_DIRECTORY}' does not exist.") + + # Validate that `MODEL_DIRECTORY` actually exists. + if not os.path.exists(MODEL_DIRECTORY): + raise Exception(f"Model directory '{MODEL_DIRECTORY}' does not exist.") + + # Parse options provided. + args = parse_arguments() + + # Update the is_verbose flag with values passed in by options. 
+ is_verbose = is_verbose or args.verbose > 0 + + if args.mode == "init": + initialize_model(args) + + elif args.mode == "exec": + execute_triton(args) + + else: + write_error(f"usage: server.py [].") + write_error(f' Invalid mode ("{args.mode}") provided.') + write_error(f' Supported values are "init" or "exec".') + die(None) + +except Exception as exception: + die(exception) diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/grafana_inference-metrics_dashboard.json b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/grafana_inference-metrics_dashboard.json new file mode 100644 index 00000000..d761f4aa --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/grafana_inference-metrics_dashboard.json @@ -0,0 +1,1539 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 28, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 19, + "panels": [], + "title": "Cluster Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "fieldMinMax": false, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "instance:node_cpu_utilisation:rate5m", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Node CPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "1-(node_memory_MemFree_bytes/node_memory_MemTotal_bytes)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Node Memory Utilization", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 14, + "panels": [], + "title": "Triton Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 0.8, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1 + }, + { + "color": "orange", + "value": 1.2 + }, + { + "color": "red", + "value": 1.5 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "max by(job) (triton:queue_compute:ratio)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "Autoscaling Metric", + "useBackend": false + } + ], + "title": "Queue : Compute 
Ratio", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "µs" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "max by(job) (triton:request_duration:average)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "maximum({{job}})", + "range": true, + "refId": "Maximum", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg by(job) (triton:request_duration:average)", + "hide": false, + "instant": false, + "legendFormat": "average({{job}})", + "range": true, + "refId": "Average" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "min by(job) (triton:request_duration:average)", + "hide": false, + "instant": false, + "legendFormat": "minimum({{job}})", + "range": true, + "refId": "Minimum" + } + ], + "title": "Request Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (rate(nv_inference_request_success[1m]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}(success)", + "range": true, + "refId": "Success", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (rate(nv_inference_request_failure[1m]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}(failure)", + "range": true, + "refId": "Failure", + "useBackend": false + } + ], + "title": "Requests / Second", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "µs" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (rate(nv_inference_compute_input_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}(input)", + "range": true, + "refId": "Input", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (rate(nv_inference_compute_output_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}(output)", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "I/O Duration", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 15, + "panels": [], + "title": "TRT-LLM Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": 
"line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg by(job) (rate(nv_trt_llm_general_metrics[1m]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "General", + "useBackend": false + } + ], + "title": "TRT-LLM General", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepBefore", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (nv_trt_llm_inflight_batcher_metrics)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "TRT-LLM In-Flight Batcher", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 
2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(job) (rate(nv_trt_llm_request_metrics[1m]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "TRT-LLM Requests", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 16, + "panels": [], + "title": "GPU Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 24 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "max by(Hostname,modelName,gpu) (avg_over_time(DCGM_FI_DEV_GPU_UTIL[$__interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{modelName}}/{{gpu}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + 
"lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 24 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg by(Hostname,modelName,gpu) (avg_over_time(DCGM_FI_DEV_GPU_TEMP[$__interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{modelName}}/{{gpu}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 24 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "avg by(Hostname,modelName,gpu) (avg_over_time(DCGM_FI_DEV_POWER_USAGE[$__interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{modelName}}/{{gpu}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": true, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 24 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by(Hostname,modelName,gpu) (DCGM_FI_DEV_FB_USED/(DCGM_FI_DEV_FB_USED+DCGM_FI_DEV_FB_FREE))", + "instant": false, + "legendFormat": "{{modelName}}/{{gpu}}", + "range": true, + "refId": "A" + } + ], + "title": "GPU Memory Utilization", + "type": "timeseries" + } + ], + "refresh": "6s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-9m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "6s", + "12s", + "1m", + "5m", + "15m", + "1h", + "2h", + "5h" + ] + }, + "timezone": "utc", + "title": "Inference Metrics", + "uid": "fdklsxg5qqyo0c", + "version": 2, + "weekStart": "monday" +} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png new file mode 100644 index 00000000..c6e37ffd Binary files /dev/null and b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png differ diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png new file mode 100644 index 00000000..40a3cb4a Binary files /dev/null and b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png differ diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png new file mode 100644 index 00000000..bf04ba4b Binary files /dev/null and b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png differ diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png new file mode 100644 index 00000000..2b94ef98 Binary files /dev/null and b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png differ diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png new file mode 100644 index 00000000..d464ee4e Binary files /dev/null and b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png differ diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/nvidia_dcgm-exporter_values.yaml 
b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/nvidia_dcgm-exporter_values.yaml
new file mode 100644
index 00000000..30111dad
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/nvidia_dcgm-exporter_values.yaml
@@ -0,0 +1,107 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# All values are defaults unless specified otherwise.
+
+image:
+  repository: nvcr.io/nvidia/k8s/dcgm-exporter
+  pullPolicy: IfNotPresent
+  tag: 3.3.5-3.4.1-ubuntu22.04
+
+arguments:
+  # Reduces the delay between GPU metrics collection passes to 1 second.
+- --collect-interval=1000
+- --collectors=/etc/dcgm-exporter/dcp-metrics-included.csv
+  # Required. Enables Kubernetes specific metric collection features.
+- --kubernetes=true
+
+serviceAccount:
+  create: true
+  annotations: { }
+  name:
+
+rollingUpdate:
+  maxUnavailable: 1
+  maxSurge: 0
+
+podLabels: { }
+
+podAnnotations:
+  prometheus.io/scrape: "true"
+  prometheus.io/port: "9400"
+  # Required by Prometheus Operator for proper metrics collection.
+  release: prometheus
+podSecurityContext: { }
+
+securityContext:
+  # Enables advanced GPU metrics features. Optional.
+  privileged: true
+  runAsNonRoot: false
+  runAsUser: 0
+  capabilities:
+    add: [ "SYS_ADMIN" ]
+
+service:
+  enable: true
+  type: ClusterIP
+  port: 9400
+  address: ":9400"
+  annotations:
+    prometheus.io/port: "9400"
+    prometheus.io/scrape: "true"
+    release: prometheus
+
+resources:
+  # Sets proper resource utilization limits, and enables Kubernetes to manage the pod's resource consumption.
+  # All containers should have these.
+  limits:
+    cpu: 2
+    memory: 1Gi
+  # Sets proper resource requirements, and enables Kubernetes to account for the pod's resource consumption.
+  # All containers should have these.
+  requests:
+    cpu: 1
+    memory: 1Gi
+
+serviceMonitor:
+  enabled: true
+  # Reduces the delay between metric collection passes.
+  interval: 1s
+  honorLabels: false
+  additionalLabels:
+    # Useful for helping Prometheus identify metrics collectors.
+    monitoring: prometheus
+    # Required by Prometheus to identify metrics collectors.
+    release: prometheus
+
+nodeSelector:
+  # Ensures that DCGM Exporter process is only deployed to nodes with GPUs.
+  nvidia.com/gpu: present
+
+tolerations:
+# Enables the DCGM Exporter pods to be deployed to nodes with GPUs.
+- key: nvidia.com/gpu
+  operator: Exists
+  effect: NoSchedule
+
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+      - matchExpressions:
+        # Ensures that DCGM Exporter process is only deployed to nodes with GPUs.
+        - key: nvidia.com/gpu
+          operator: Exists
+
+kubeletPath: "/var/lib/kubelet/pod-resources"
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/nvidia_gpu-feature-discovery_daemonset.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/nvidia_gpu-feature-discovery_daemonset.yaml
new file mode 100644
index 00000000..02ac2cd8
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/nvidia_gpu-feature-discovery_daemonset.yaml
@@ -0,0 +1,87 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# In the document below, the version `0.8.2` of the gpu-feature-discovery container is used.
+# It is always wise to check if a new version has been released and to use the latest available release when possible.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: gpu-feature-discovery
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: gpu-feature-discovery
+    app.kubernetes.io/version: 0.8.2
+    app.kubernetes.io/part-of: nvidia-gpu
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: gpu-feature-discovery
+      app.kubernetes.io/part-of: nvidia-gpu
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: gpu-feature-discovery
+        app.kubernetes.io/version: 0.8.2
+        app.kubernetes.io/part-of: nvidia-gpu
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            # The following set of node selector match expressions restricts the nodes the service's pods
+            # can be deployed to, to nodes which meet one or more of the following criteria:
+            # * Nodes with NVIDIA PCIE devices attached (10DE is NVIDIA's PCI vendor ID).
+            # * Nodes with NVIDIA CPUs.
+            # * Nodes with NVIDIA GPUs.
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: feature.node.kubernetes.io/pci-10de.present
+                operator: In
+                values:
+                - "true"
+            - matchExpressions:
+              - key: feature.node.kubernetes.io/cpu-model.vendor_id
+                operator: In
+                values:
+                - "NVIDIA"
+            - matchExpressions:
+              - key: "nvidia.com/gpu"
+                operator: In
+                values:
+                - "true"
+                - present
+      containers:
+      - image: nvcr.io/nvidia/gpu-feature-discovery:v0.8.2
+        name: gpu-feature-discovery
+        volumeMounts:
+        - name: output-dir
+          mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
+        - name: host-sys
+          mountPath: "/sys"
+        env:
+        - name: MIG_STRATEGY
+          value: none
+        securityContext:
+          privileged: true
+      # Enables the service's pods to be deployed on nodes with GPUs.
+ tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + volumes: + - name: output-dir + hostPath: + path: "/etc/kubernetes/node-feature-discovery/features.d" + - name: host-sys + hostPath: + path: "/sys" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/prometheus-adapter_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/prometheus-adapter_values.yaml new file mode 100644 index 00000000..39fbbcc2 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/prometheus-adapter_values.yaml @@ -0,0 +1,151 @@ +# All values are default unless otherwise specified. +affinity: { } + +topologySpreadConstraints: [ ] + +image: + repository: registry.k8s.io/prometheus-adapter/prometheus-adapter + tag: "" + pullPolicy: IfNotPresent + +logLevel: 4 + +# Update metrics at least ten times per minute. +metricsRelistInterval: 6s + +listenPort: 6443 + +nodeSelector: { } + +priorityClassName: "" + +namespaceOverride: "" + +customAnnotations: + role: custom-metrics + +customLabels: + monitoring: prometheus-adapter + +prometheus: + # Set ths to the name of the Prometheus service as shown by `kubectl get services -n monitoring`. + url: http://prometheus-kube-prometheus-prometheus + port: 9090 + path: /metrics + +replicas: 1 + +podSecurityContext: + fsGroup: 10001 + +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 10001 + seccompProfile: + type: RuntimeDefault + +rbac: + create: true + useAuthReaderClusterRole: false + externalMetrics: + resources: [ "*" ] + customMetrics: + resources: [ "*" ] + +psp: + create: false + annotations: { } + +serviceAccount: + create: true + name: + annotations: { } + +dnsConfig: { } + +resources: { } + +livenessProbe: + httpGet: + path: /healthz + port: https + scheme: HTTPS + initialDelaySeconds: 30 + timeoutSeconds: 5 + +readinessProbe: + httpGet: + path: /healthz + port: https + scheme: HTTPS + initialDelaySeconds: 30 + timeoutSeconds: 5 + +startupProbe: { } + +rules: + default: true + custom: [ ] + existing: + external: [ ] + + +service: + annotations: { } + port: 443 + type: ClusterIP + ipDualStack: + enabled: false + ipFamilies: [ "IPv6", "IPv4" ] + ipFamilyPolicy: "PreferDualStack" + +tls: + enable: false + ca: |- + # Public CA file that signed the APIService + key: |- + # Private key of the APIService + certificate: |- + # Public key of the APIService + +env: [ ] + +extraArguments: [ ] + +extraContainers: [ ] + +extraVolumes: [ ] + +extraVolumeMounts: [ ] + +tolerations: [ ] + +podLabels: { } + +podAnnotations: { } + +deploymentAnnotations: { } + +hostNetwork: + enabled: false + +strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 25% + maxSurge: 25% + +podDisruptionBudget: + enabled: false + minAvailable: + maxUnavailable: 1 + +certManager: + enabled: false + caCertDuration: 43800h0m0s + certDuration: 8760h0m0s + diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/triton-metrics_prometheus-rule.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/triton-metrics_prometheus-rule.yaml new file mode 100644 index 00000000..be91701a --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/triton-metrics_prometheus-rule.yaml @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: triton-metrics
+  labels:
+    app.kubernetes.io/component: autoscaler
+    release: prometheus
+spec:
+  groups:
+  - name: autoscaling
+    interval: 6s
+    rules:
+    # Average number of microseconds inference requests take to compute after unqueueing (not including cache hits).
+    - expr: rate(nv_inference_compute_infer_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
+      record: triton:compute_duration:average
+    # Average number of microseconds inference requests spend in queue before being processed (not including cache hits).
+    - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
+      record: triton:queue_duration:average
+    # Average number of microseconds inference requests take in total (not including cache hits).
+    - expr: rate(nv_inference_request_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
+      record: triton:request_duration:average
+    # Ratio of the time inference requests spend queued to the time they spend in compute (not including cache hits).
+    - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_compute_infer_duration_us[1m]),1)
+      record: triton:queue_compute:ratio
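The recording rules above are what drive horizontal pod autoscaling in this guide: once the Prometheus Adapter exposes `triton:queue_compute:ratio` to Kubernetes as a custom metric, a ratio above 1 means requests are spending more time waiting in queue than being computed, which is the signal to add Triton Server replicas. For orientation only, a minimal sketch of a HorizontalPodAutoscaler consuming that metric is shown below; the deployment name (`triton-server`), the replica bounds, and the target value are illustrative assumptions and should be taken from the guide's Helm chart values rather than from this sketch.

```yaml
# Minimal sketch, not part of the chart: assumes a Deployment named "triton-server"
# and that the Prometheus Adapter exposes triton:queue_compute:ratio as a pods metric.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: triton-server
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: triton-server
  minReplicas: 1
  maxReplicas: 4
  metrics:
  - type: Pods
    pods:
      metric:
        name: triton:queue_compute:ratio
      target:
        type: AverageValue
        # Scale out once requests wait in queue roughly as long as they take to compute.
        averageValue: 1
```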