From 4994c11918a8f3c3e80955f367596d841ac7e15f Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 18 Oct 2024 08:47:09 +0200
Subject: [PATCH 1/3] Update HuggingFace DLC for TGI URI to 2.3 (latest)

---
 modules/inference-service/main.tf                               | 2 +-
 .../genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md    | 2 +-
 .../deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml        | 2 +-
 .../genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md | 2 +-
 .../deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml   | 2 +-
 .../genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md           | 2 +-
 .../text-generation-interface.yaml                              | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index 44655fc23..d15d608f8 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
           }
         }
         container {
-          image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
+          image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311"
           name  = "mistral-7b-instruct"

           port {
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
index 6362c35a6..f40e5d1f8 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
@@ -104,7 +104,7 @@ Pod Template:
   Labels:  app=mistral-7b
   Containers:
    mistral-7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
index 1828472a8..387a155ce 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
       containers:
       - name: mistral-7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
         resources:
           limits:
             nvidia.com/gpu: 1
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
index 60739ffc6..089740edf 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
@@ -127,7 +127,7 @@ Pod Template:
   Labels:  app=mixtral8x7b
   Containers:
    mixtral8x7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
index 72a7e61d6..46a1c9475 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
@@ -30,7 +30,7 @@ spec:
         cloud.google.com/gke-accelerator: "nvidia-l4"
       containers:
      - name: mixtral8x7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
        ports:
        - name: server-port
          containerPort: 8080
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
index 617e4072c..4574a8a70 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
@@ -76,7 +76,7 @@ spec:
     spec:
       containers:
       - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
         resources:
           limits:
             nvidia.com/gpu: 2
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
index a9963a719..462fce210 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
      containers:
      - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
        resources:
          limits:
            nvidia.com/gpu: 2
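
A quick way to sanity-check the updated 2.3 DLC image independently of the tutorials is a throwaway single-replica Deployment along the lines of the sketch below; the tgi-smoke-test name and the Mistral model ID are illustrative assumptions, not taken from this series:

    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: tgi-smoke-test            # hypothetical name, used only for this check
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: tgi-smoke-test
      template:
        metadata:
          labels:
            app: tgi-smoke-test
        spec:
          containers:
          - name: tgi
            image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
            env:
            - name: MODEL_ID          # standard TGI variable selecting the model to serve
              value: mistralai/Mistral-7B-Instruct-v0.1   # assumed model; any TGI-supported model works
            - name: PORT
              value: "8080"
            resources:
              limits:
                nvidia.com/gpu: 1
          nodeSelector:
            cloud.google.com/gke-accelerator: nvidia-l4

Once the pod is Running, a kubectl port-forward to 8080 and a POST to TGI's /generate endpoint confirm the new image serves traffic.
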
From bceb12a97fec8440fb3c5ca39e481eb64fb2d814 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 7 Jan 2025 09:50:49 +0100
Subject: [PATCH 2/3] Update `mountPath` to point to `/tmp` instead of `/data`

---
 modules/inference-service/main.tf                        | 6 +++---
 .../deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml | 6 +++---
 .../mixtral-8x7b.yaml                                    | 4 ++--
 .../text-generation-interface.yaml                       | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index d15d608f8..a6ce48819 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -132,8 +132,8 @@ resource "kubernetes_deployment" "inference_deployment" {
           }

           volume_mount {
-            mount_path = "/data"
-            name       = "data"
+            mount_path = "/tmp"
+            name       = "tmp"
           }

           volume_mount {
@@ -166,7 +166,7 @@ resource "kubernetes_deployment" "inference_deployment" {
         }

         volume {
-          name = "data"
+          name = "tmp"
           empty_dir {}
         }
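
For orientation, and not part of the patch itself: the Terraform change above renders to roughly the following in the resulting Kubernetes Deployment, a scratch emptyDir mounted at /tmp so the container's model cache lives on pod-local storage (a sketch, not generated output):

    # Approximate Kubernetes rendering of the volume_mount/volume pair above
    volumeMounts:
    - mountPath: /tmp
      name: tmp
    volumes:
    - name: tmp
      emptyDir: {}
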
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
index 387a155ce..23fce7a4b 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
@@ -47,13 +47,13 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
-          name: data
+        - mountPath: /tmp
+          name: tmp
       volumes:
       - name: dshm
         emptyDir:
           medium: Memory
-      - name: data
+      - name: tmp
         hostPath:
           path: /mnt/stateful_partition/kube-ephemeral-ssd/mistral-data
 ---
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
index 46a1c9475..d551a5462 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
@@ -53,7 +53,7 @@ spec:
             memory: "42Gi"
             nvidia.com/gpu: "2"
         volumeMounts:
-        - mountPath: /data
+        - mountPath: /tmp
           name: ephemeral-volume
         - mountPath: /dev/shm
           name: dshm
@@ -61,7 +61,7 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-      - name: data
+      - name: tmp
         hostPath:
           path: /mnt/stateful_partition/kube-ephemeral-ssd/mixtral-data
       - name: ephemeral-volume
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
index 462fce210..d486b19b6 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
@@ -49,13 +49,13 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
-          name: data
+        - mountPath: /tmp
+          name: tmp
       volumes:
      - name: dshm
        emptyDir:
          medium: Memory
-      - name: data
+      - name: tmp
        hostPath:
          path: /mnt/stateful_partition/kube-ephemeral-ssd/llama-data
      nodeSelector:
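
The rationale for /tmp only becomes explicit in the next patch: in this DLC the HF_HOME environment variable points at /tmp, so that is where TGI downloads model weights. A sketch of the same idea with the variable spelled out for readers; HF_HOME is the standard Hugging Face cache override, and the model-cache path here is illustrative rather than taken from the patches:

    containers:
    - name: tgi
      env:
      - name: HF_HOME        # already /tmp in the 2.3 DLC; shown explicitly for clarity
        value: /tmp
      volumeMounts:
      - mountPath: /tmp
        name: tmp
    volumes:
    - name: tmp
      hostPath:
        path: /mnt/stateful_partition/kube-ephemeral-ssd/model-cache   # illustrative path
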
From 1cce0afcfb89bb1c5b40665a86fb407effd659cb Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Wed, 8 Jan 2025 09:13:03 +0100
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Raushan Kumar
---
 modules/inference-service/main.tf                     | 3 +++
 .../deploying-mistral-7b-instruct-L4gpus/README.md    | 4 ++--
 .../deploying-mixtral-8x7b-instruct-L4-gpus/README.md | 4 ++--
 .../genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md | 6 +++---
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index a6ce48819..8c999f24f 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -132,6 +132,9 @@ resource "kubernetes_deployment" "inference_deployment" {
           }

           volume_mount {
+            # mount_path is set to /tmp as it's the path where the HF_HOME environment
+            # variable points to, i.e. where the model downloaded from the Hugging Face
+            # Hub will be stored
             mount_path = "/tmp"
             name       = "tmp"
           }
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
index f40e5d1f8..49e79d387 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
@@ -115,14 +115,14 @@ Pod Template:
       PORT:      8080
       QUANTIZE:  bitsandbytes-nf4
     Mounts:
-      /data from data (rw)
+      /tmp from tmp (rw)
       /dev/shm from dshm (rw)
   Volumes:
    dshm:
     Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
     Medium:     Memory
     SizeLimit:  <unset>
-   data:
+   tmp:
     Type:          HostPath (bare host directory volume)
     Path:          /mnt/stateful_partition/kube-ephemeral-ssd/mistral-data
     HostPathType:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
index 089740edf..ca8822a90 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
@@ -144,14 +144,14 @@ Pod Template:
       NUM_SHARD:  2
       PORT:       8080
     Mounts:
-      /data from ephemeral-volume (rw)
+      /tmp from ephemeral-volume (rw)
       /dev/shm from dshm (rw)
   Volumes:
    dshm:
     Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
     Medium:     Memory
     SizeLimit:  <unset>
-   data:
+   tmp:
     Type:          HostPath (bare host directory volume)
     Path:          /mnt/stateful_partition/kube-ephemeral-ssd/mixtral-data
     HostPathType:
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
index 4574a8a70..45701222d 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
@@ -97,13 +97,13 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
-          name: data
+        - mountPath: /tmp
+          name: tmp
       volumes:
       - name: dshm
         emptyDir:
           medium: Memory
-      - name: data
+      - name: tmp
         hostPath:
           path: /mnt/stateful_partition/kube-ephemeral-ssd/llama-data
       nodeSelector:
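
Since the first boot now downloads weights into /tmp before the server comes up, deployments built on these manifests may want a generous startup probe against TGI's /health endpoint. A sketch, with probe thresholds that are assumptions to be tuned per model size:

    startupProbe:
      httpGet:
        path: /health        # TGI health endpoint; returns 200 once the model is loaded
        port: 8080
      periodSeconds: 10
      failureThreshold: 90   # assumed value: allows up to 15 minutes for large downloads
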