From 4994c11918a8f3c3e80955f367596d841ac7e15f Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 18 Oct 2024 08:47:09 +0200
Subject: [PATCH 1/3] Update HuggingFace DLC for TGI URI to 2.3 (latest)

---
 modules/inference-service/main.tf                               | 2 +-
 .../genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md    | 2 +-
 .../deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml        | 2 +-
 .../genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md | 2 +-
 .../deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml   | 2 +-
 .../genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md           | 2 +-
 .../text-generation-interface.yaml                              | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index 44655fc23..d15d608f8 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
           }
         }
         container {
-          image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
+          image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311"
           name  = "mistral-7b-instruct"

           port {
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
index 6362c35a6..f40e5d1f8 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
@@ -104,7 +104,7 @@ Pod Template:
   Labels:  app=mistral-7b
   Containers:
    mistral-7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
index 1828472a8..387a155ce 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
       containers:
       - name: mistral-7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
         resources:
           limits:
             nvidia.com/gpu: 1
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
index 60739ffc6..089740edf 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
@@ -127,7 +127,7 @@ Pod Template:
   Labels:  app=mixtral8x7b
   Containers:
    mixtral8x7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
index 72a7e61d6..46a1c9475 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
@@ -30,7 +30,7 @@ spec:
         cloud.google.com/gke-accelerator: "nvidia-l4"
       containers:
      - name: mixtral8x7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
        ports:
        - name: server-port
          containerPort: 8080
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
index 617e4072c..4574a8a70 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
@@ -76,7 +76,7 @@ spec:
     spec:
       containers:
       - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
         resources:
           limits:
             nvidia.com/gpu: 2
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
index a9963a719..462fce210 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
@@ -28,7 +28,7 @@ spec:
     spec:
      containers:
      - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
        resources:
          limits:
            nvidia.com/gpu: 2
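
A quick way to sanity-check the updated 2.3 DLC image independently of the tutorials is a throwaway single-replica Deployment along the lines of the sketch below; the tgi-smoke-test name and the Mistral model ID are illustrative assumptions, not taken from this series:

    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: tgi-smoke-test            # hypothetical name, used only for this check
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: tgi-smoke-test
      template:
        metadata:
          labels:
            app: tgi-smoke-test
        spec:
          containers:
          - name: tgi
            image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
            env:
            - name: MODEL_ID          # standard TGI variable selecting the model to serve
              value: mistralai/Mistral-7B-Instruct-v0.1   # assumed model; any TGI-supported model works
            - name: PORT
              value: "8080"
            resources:
              limits:
                nvidia.com/gpu: 1
          nodeSelector:
            cloud.google.com/gke-accelerator: nvidia-l4

Once the pod is Running, a kubectl port-forward to 8080 and a POST to TGI's /generate endpoint confirm the new image serves traffic.
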
From bceb12a97fec8440fb3c5ca39e481eb64fb2d814 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 7 Jan 2025 09:50:49 +0100
Subject: [PATCH 2/3] Update `mountPath` to point to `/tmp` instead of `/data`

---
 modules/inference-service/main.tf                        | 6 +++---
 .../deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml | 6 +++---
 .../mixtral-8x7b.yaml                                    | 4 ++--
 .../text-generation-interface.yaml                       | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index d15d608f8..a6ce48819 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -132,8 +132,8 @@ resource "kubernetes_deployment" "inference_deployment" {
           }

           volume_mount {
-            mount_path = "/data"
-            name       = "data"
+            mount_path = "/tmp"
+            name       = "tmp"
           }

           volume_mount {
@@ -166,7 +166,7 @@ resource "kubernetes_deployment" "inference_deployment" {
         }

         volume {
-          name = "data"
+          name = "tmp"
           empty_dir {}
         }
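
For orientation, and not part of the patch itself: the Terraform change above renders to roughly the following in the resulting Kubernetes Deployment, a scratch emptyDir mounted at /tmp so the container's model cache lives on pod-local storage (a sketch, not generated output):

    # Approximate Kubernetes rendering of the volume_mount/volume pair above
    volumeMounts:
    - mountPath: /tmp
      name: tmp
    volumes:
    - name: tmp
      emptyDir: {}
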
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
index 387a155ce..23fce7a4b 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/mistral-7b.yaml
@@ -47,13 +47,13 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
-          name: data
+        - mountPath: /tmp
+          name: tmp
       volumes:
       - name: dshm
         emptyDir:
           medium: Memory
-      - name: data
+      - name: tmp
         hostPath:
           path: /mnt/stateful_partition/kube-ephemeral-ssd/mistral-data
 ---
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
index 46a1c9475..d551a5462 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/mixtral-8x7b.yaml
@@ -53,7 +53,7 @@ spec:
             memory: "42Gi"
             nvidia.com/gpu: "2"
         volumeMounts:
-        - mountPath: /data
+        - mountPath: /tmp
           name: ephemeral-volume
         - mountPath: /dev/shm
           name: dshm
@@ -61,7 +61,7 @@ spec:
       - name: dshm
         emptyDir:
           medium: Memory
-      - name: data
+      - name: tmp
         hostPath:
           path: /mnt/stateful_partition/kube-ephemeral-ssd/mixtral-data
       - name: ephemeral-volume
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
index 462fce210..d486b19b6 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/text-generation-interface.yaml
@@ -49,13 +49,13 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
-          name: data
+        - mountPath: /tmp
+          name: tmp
       volumes:
      - name: dshm
        emptyDir:
          medium: Memory
-      - name: data
+      - name: tmp
        hostPath:
          path: /mnt/stateful_partition/kube-ephemeral-ssd/llama-data
      nodeSelector:
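
The rationale for /tmp only becomes explicit in the next patch: in this DLC the HF_HOME environment variable points at /tmp, so that is where TGI downloads model weights. A sketch of the same idea with the variable spelled out for readers; HF_HOME is the standard Hugging Face cache override, and the model-cache path here is illustrative rather than taken from the patches:

    containers:
    - name: tgi
      env:
      - name: HF_HOME        # already /tmp in the 2.3 DLC; shown explicitly for clarity
        value: /tmp
      volumeMounts:
      - mountPath: /tmp
        name: tmp
    volumes:
    - name: tmp
      hostPath:
        path: /mnt/stateful_partition/kube-ephemeral-ssd/model-cache   # illustrative path
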
From 1cce0afcfb89bb1c5b40665a86fb407effd659cb Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Wed, 8 Jan 2025 09:13:03 +0100
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Raushan Kumar
---
 modules/inference-service/main.tf                     | 3 +++
 .../deploying-mistral-7b-instruct-L4gpus/README.md    | 4 ++--
 .../deploying-mixtral-8x7b-instruct-L4-gpus/README.md | 4 ++--
 .../genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md | 6 +++---
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/modules/inference-service/main.tf b/modules/inference-service/main.tf
index a6ce48819..8c999f24f 100644
--- a/modules/inference-service/main.tf
+++ b/modules/inference-service/main.tf
@@ -132,6 +132,9 @@ resource "kubernetes_deployment" "inference_deployment" {
           }

           volume_mount {
+            # mount_path is set to /tmp as it's the path where the HF_HOME environment
+            # variable points to, i.e. where the model downloaded from the Hugging Face
+            # Hub will be stored
             mount_path = "/tmp"
             name       = "tmp"
           }
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
index f40e5d1f8..49e79d387 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus/README.md
@@ -115,14 +115,14 @@ Pod Template:
       PORT:      8080
       QUANTIZE:  bitsandbytes-nf4
     Mounts:
-      /data from data (rw)
+      /tmp from tmp (rw)
       /dev/shm from dshm (rw)
   Volumes:
    dshm:
     Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
     Medium:     Memory
     SizeLimit:  <unset>
-   data:
+   tmp:
     Type:          HostPath (bare host directory volume)
     Path:          /mnt/stateful_partition/kube-ephemeral-ssd/mistral-data
     HostPathType:
diff --git a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
index 089740edf..ca8822a90 100644
--- a/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus/README.md
@@ -144,14 +144,14 @@ Pod Template:
       NUM_SHARD:  2
       PORT:       8080
     Mounts:
-      /data from ephemeral-volume (rw)
+      /tmp from ephemeral-volume (rw)
       /dev/shm from dshm (rw)
   Volumes:
    dshm:
     Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
     Medium:     Memory
     SizeLimit:  <unset>
-   data:
+   tmp:
     Type:          HostPath (bare host directory volume)
     Path:          /mnt/stateful_partition/kube-ephemeral-ssd/mixtral-data
     HostPathType:
diff --git a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
index 4574a8a70..45701222d 100644
--- a/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
+++ b/tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus/README.md
@@ -97,13 +97,13 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
-          name: data
+        - mountPath: /tmp
+          name: tmp
       volumes:
       - name: dshm
         emptyDir:
           medium: Memory
-      - name: data
+      - name: tmp
         hostPath:
           path: /mnt/stateful_partition/kube-ephemeral-ssd/llama-data
       nodeSelector:
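
Since the first boot now downloads weights into /tmp before the server comes up, deployments built on these manifests may want a generous startup probe against TGI's /health endpoint. A sketch, with probe thresholds that are assumptions to be tuned per model size:

    startupProbe:
      httpGet:
        path: /health        # TGI health endpoint; returns 200 once the model is loaded
        port: 8080
      periodSeconds: 10
      failureThreshold: 90   # assumed value: allows up to 15 minutes for large downloads
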