Update AutoScaling Tutorial to remove triton CLI (#108)
* Update AutoScaling Blog to 24.08

* Triton CLI removed
indrajit96 authored Oct 9, 2024
1 parent f59d968 commit 9d016f2
Showing 8 changed files with 248 additions and 188 deletions.

One of the changed files has a large diff that is not rendered below.

@@ -0,0 +1,11 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nfs-claim-autoscaling-2
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: efs-autoscaling-sc
  resources:
    requests:
      storage: 200Gi
@@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-autoscaling-pv-2
spec:
  capacity:
    storage: 200Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: efs-autoscaling-sc
  csi:
    driver: efs.csi.aws.com
    volumeHandle: fs-0c6ba87870e4be751
@@ -0,0 +1,5 @@
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: efs-autoscaling-sc
provisioner: efs.csi.aws.com
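
The three manifests above define the nfs-claim-autoscaling-2 claim, a 200Gi PersistentVolume backed by EFS file system fs-0c6ba87870e4be751, and the efs-autoscaling-sc StorageClass that ties them together. A minimal sketch of applying and verifying them, with file names chosen here purely for illustration (the repository's actual file names may differ):

# Apply the storage class, persistent volume, and claim.
kubectl apply -f storage_class.yaml
kubectl apply -f persistent_volume.yaml
kubectl apply -f persistent_volume_claim.yaml

# The claim should report STATUS "Bound" once it matches the volume.
kubectl get pv efs-autoscaling-pv-2
kubectl get pvc nfs-claim-autoscaling-2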
@@ -190,7 +190,6 @@ spec:
            readOnly: false
          - mountPath: /var/run/models
            name: model-repository
            readOnly: true
          - mountPath: /var/run/cache
            name: transformers-cache
            readOnly: false
@@ -255,14 +254,12 @@ spec:
              ephemeral-storage: 96Gi
              nvidia.com/gpu: {{ $model_gpus }}
          volumeMounts:
          - mountPath: /var/run/engines
            name: engine-repository
            readOnly: false
          - mountPath: /var/run/models
            name: model-repository
            readOnly: false
          - mountPath: /var/run/cache
            name: transformers-cache
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /var/run/engines
            name: engine-repository
            readOnly: false
      {{- with $.Values.model }}
      {{- if .pullSecret }}
@@ -288,9 +285,12 @@ spec:
          path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
          type: DirectoryOrCreate
      - name: model-repository
        hostPath:
          path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
          type: DirectoryOrCreate
        persistentVolumeClaim:
          claimName: nfs-claim-autoscaling-2
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: 512Gi
      {{- with $.Values.model }}
      {{- with .pullSecret }}
      - name: hf-secret
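
Taken together, these chart changes drop the read-only flag on the model repository mount, replace the per-pod hostPath model repository with the shared nfs-claim-autoscaling-2 claim, and add a memory-backed emptyDir at /dev/shm (up to 512Gi). A quick, illustrative way to confirm the new mounts from inside a running Triton pod; the pod name below is a placeholder, not something defined by this commit:

kubectl get pods
kubectl exec <triton-pod-name> -- df -h /dev/shm
kubectl exec <triton-pod-name> -- ls /var/run/models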
@@ -23,7 +23,7 @@
# Any changes here must also be made there, and vice versa.
CACHE_DIRECTORY = "/var/run/cache"
HUGGING_FACE_TOKEN_PATH = "/var/run/secrets/hugging_face/password"
MODEL_DIRECTORY = "/var/run/models"
MODEL_DIRECTORY = "/var/run/models/tensorrtllm_backend/triton_model_repo"

ERROR_EXIT_DELAY = 15
ERROR_CODE_FATAL = 255
@@ -212,66 +212,6 @@ def execute_triton(args):
    exit(result)


# ---


def initialize_model(args):
    if args.model is None or len(args.model) == 0:
        die("Model name must be provided.")

    hugging_face_authenticate(args)

    engine_path = os.path.join(ENGINE_DIRECTORY, args.model)
    model_path = os.path.join(MODEL_DIRECTORY, args.model)

    # When the model and plan already exist, we can exit early, happily.
    if os.path.exists(engine_path) and os.path.exists(model_path):
        write_output(
            f"TensorRT engine and plan detected for {args.model}. No work to do, exiting."
        )
        exit(EXIT_SUCCESS)

    write_output(f"Begin generation of TensorRT engine and plan for {args.model}.")
    write_output(" ")

    # Build up a set of args for the subprocess call.
    cmd_args = [
        "triton",
        "import",
        "--model",
        args.model,
        "--model-repository",
        MODEL_DIRECTORY,
    ]

    if args.engine == "vllm":
        cmd_args += ["--backend", "vllm"]

    else:
        cmd_args += ["--backend", "tensorrtllm"]

    if args.dt is not None and args.dt in ["bfloat", "float16", "float32"]:
        cmd_args += ["--data-type", args.dt]

    if args.pp > 1:
        cmd_args += ["--pipeline-parallelism", f"{args.pp}"]

    if args.tp > 1:
        cmd_args += ["--tensor-parallelism", f"{args.tp}"]

    # When verbose, insert the verbose flag.
    # It is important to note that the flag must immediately follow `triton` and cannot be in another ordering position.
    # This limitation will likely be removed a future release of triton_cli.
    if is_verbose:
        cmd_args.insert(1, "--verbose")

    result = run_command(cmd_args)
    exit(result)


# ---


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", type=str, choices=["exec", "init"])
@@ -302,31 +242,17 @@ def parse_arguments():
HUGGING_FACE_HOME = os.getenv(HUGGING_FACE_KEY)

is_verbose = os.getenv(CLI_VERBOSE_KEY) is not None

# Validate that `ENGINE_DIRECTORY` isn't empty.
if ENGINE_DIRECTORY is None or len(ENGINE_DIRECTORY) == 0:
    raise Exception(f"Required environment variable '{ENGINE_PATH_KEY}' not set.")

# Validate that `ENGINE_DIRECTORY` actually exists.
if not os.path.exists(ENGINE_DIRECTORY):
    raise Exception(f"Engine directory '{ENGINE_DIRECTORY}' does not exist.")

# Validate that `MODEL_DIRECTORY` actually exists.
if not os.path.exists(MODEL_DIRECTORY):
    raise Exception(f"Model directory '{MODEL_DIRECTORY}' does not exist.")

# Parse options provided.
args = parse_arguments()

# Update the is_verbose flag with values passed in by options.
is_verbose = is_verbose or args.verbose > 0

if args.mode == "init":
    initialize_model(args)
    print("Hello, World!")
    exit(EXIT_SUCCESS)

elif args.mode == "exec":
    # Update the is_verbose flag with values passed in by options.
    is_verbose = is_verbose or args.verbose > 0
    execute_triton(args)

else:
    write_error(f"usage: server.py <mode> [<options>].")
    write_error(f'  Invalid mode ("{args.mode}") provided.')
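
The server.py changes remove the Triton CLI driven initialization path entirely: MODEL_DIRECTORY now points at /var/run/models/tensorrtllm_backend/triton_model_repo on the shared volume, which is expected to be populated ahead of time, and "init" mode is reduced to a stub that prints a placeholder and exits. Purely as a sketch of the two remaining modes, assuming no additional flags are required (other flag names are not visible in this diff):

# "init" no longer builds engines or imports models; it exits immediately.
python3 server.py init

# "exec" still launches Triton against the pre-built model repository.
python3 server.py exec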
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
ARG ENGINE_DEST_PATH=/var/run/engines
ARG HF_HOME=/var/run/cache

@@ -36,14 +36,6 @@ ENV HF_HOME ${HF_HOME}
# Set the active working directory.
WORKDIR /workspace

# Install a custom version of Triton CLI that support Tensor parallelism and
# the 70B version of Llama models.
RUN pip --verbose install \
--no-cache-dir \
--no-color \
--no-input \
git+https://github.com/triton-inference-server/triton_cli.git@jwyman/aslb-mn

# Copy the server script.
COPY server.py .

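
With the base image bumped to 24.08 and the custom triton_cli install removed, the container build now only copies server.py on top of the Triton TRT-LLM image. Building it might look like the following, with the tag chosen purely for illustration:

# Hypothetical tag; push to whichever registry the Helm chart pulls from.
docker build -t triton-autoscaling:24.08 .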
@@ -0,0 +1,29 @@
apiVersion: v1
kind: Pod
metadata:
  name: setup-ssh-nfs
  labels:
    app: setup-ssh-nfs
spec:
  containers:
    - name: triton
      image: nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
      command: ["sleep", "infinity"]
      resources:
        limits:
          nvidia.com/gpu: 4
        requests:
          nvidia.com/gpu: 4
      volumeMounts:
        - mountPath: /var/run/models
          name: model-repository
        - mountPath: /dev/shm
          name: dshm
  volumes:
    - name: model-repository
      persistentVolumeClaim:
        claimName: nfs-claim-autoscaling-2
    - name: dshm
      emptyDir:
        medium: Memory
        sizeLimit: 512Gi
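
This setup-ssh-nfs pod mounts the same nfs-claim-autoscaling-2 claim at /var/run/models, so it can be used to populate the shared model repository (for example, laying out tensorrtllm_backend/triton_model_repo and the built engines) before the autoscaled deployment starts. A minimal sketch of that workflow, assuming the manifest above is saved as setup_ssh_nfs.yaml (a hypothetical file name):

kubectl apply -f setup_ssh_nfs.yaml
kubectl wait --for=condition=Ready pod/setup-ssh-nfs

# Work interactively inside the pod to prepare
# /var/run/models/tensorrtllm_backend/triton_model_repo on the shared volume.
kubectl exec -it setup-ssh-nfs -- bash

# Remove the pod once the repository is populated; the PVC retains the data.
kubectl delete pod setup-ssh-nfs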
