Update AutoScaling Tutorial to remove triton CLI (#108)
* Update AutoScaling Blog to 24.08

* Triton CLI removed
indrajit96 authored Oct 9, 2024
1 parent f59d968 commit 9d016f2
Showing 8 changed files with 248 additions and 188 deletions.

One of the changed files has a large diff that is not rendered below.

@@ -0,0 +1,11 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nfs-claim-autoscaling-2
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: efs-autoscaling-sc
  resources:
    requests:
      storage: 200Gi
@@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-autoscaling-pv-2
spec:
  capacity:
    storage: 200Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: efs-autoscaling-sc
  csi:
    driver: efs.csi.aws.com
    volumeHandle: fs-0c6ba87870e4be751
@@ -0,0 +1,5 @@
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: efs-autoscaling-sc
provisioner: efs.csi.aws.com
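
The three manifests above define the nfs-claim-autoscaling-2 claim, a 200Gi PersistentVolume backed by EFS file system fs-0c6ba87870e4be751, and the efs-autoscaling-sc StorageClass that ties them together. A minimal sketch of applying and verifying them, with file names chosen here purely for illustration (the repository's actual file names may differ):

# Apply the storage class, persistent volume, and claim.
kubectl apply -f storage_class.yaml
kubectl apply -f persistent_volume.yaml
kubectl apply -f persistent_volume_claim.yaml

# The claim should report STATUS "Bound" once it matches the volume.
kubectl get pv efs-autoscaling-pv-2
kubectl get pvc nfs-claim-autoscaling-2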
@@ -190,7 +190,6 @@ spec:
            readOnly: false
          - mountPath: /var/run/models
            name: model-repository
            readOnly: true
          - mountPath: /var/run/cache
            name: transformers-cache
            readOnly: false
@@ -255,14 +254,12 @@ spec:
              ephemeral-storage: 96Gi
              nvidia.com/gpu: {{ $model_gpus }}
          volumeMounts:
          - mountPath: /var/run/engines
            name: engine-repository
            readOnly: false
          - mountPath: /var/run/models
            name: model-repository
            readOnly: false
          - mountPath: /var/run/cache
            name: transformers-cache
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /var/run/engines
            name: engine-repository
            readOnly: false
      {{- with $.Values.model }}
      {{- if .pullSecret }}
@@ -288,9 +285,12 @@ spec:
          path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
          type: DirectoryOrCreate
      - name: model-repository
        hostPath:
          path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
          type: DirectoryOrCreate
        persistentVolumeClaim:
          claimName: nfs-claim-autoscaling-2
      - name: dshm
        emptyDir:
          medium: Memory
          sizeLimit: 512Gi
      {{- with $.Values.model }}
      {{- with .pullSecret }}
      - name: hf-secret
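
Taken together, these chart changes drop the read-only flag on the model repository mount, replace the per-pod hostPath model repository with the shared nfs-claim-autoscaling-2 claim, and add a memory-backed emptyDir at /dev/shm (up to 512Gi). A quick, illustrative way to confirm the new mounts from inside a running Triton pod; the pod name below is a placeholder, not something defined by this commit:

kubectl get pods
kubectl exec <triton-pod-name> -- df -h /dev/shm
kubectl exec <triton-pod-name> -- ls /var/run/models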
@@ -23,7 +23,7 @@
# Any changes here must also be made there, and vice versa.
CACHE_DIRECTORY = "/var/run/cache"
HUGGING_FACE_TOKEN_PATH = "/var/run/secrets/hugging_face/password"
MODEL_DIRECTORY = "/var/run/models"
MODEL_DIRECTORY = "/var/run/models/tensorrtllm_backend/triton_model_repo"

ERROR_EXIT_DELAY = 15
ERROR_CODE_FATAL = 255
@@ -212,66 +212,6 @@ def execute_triton(args):
    exit(result)


# ---


def initialize_model(args):
    if args.model is None or len(args.model) == 0:
        die("Model name must be provided.")

    hugging_face_authenticate(args)

    engine_path = os.path.join(ENGINE_DIRECTORY, args.model)
    model_path = os.path.join(MODEL_DIRECTORY, args.model)

    # When the model and plan already exist, we can exit early, happily.
    if os.path.exists(engine_path) and os.path.exists(model_path):
        write_output(
            f"TensorRT engine and plan detected for {args.model}. No work to do, exiting."
        )
        exit(EXIT_SUCCESS)

    write_output(f"Begin generation of TensorRT engine and plan for {args.model}.")
    write_output(" ")

    # Build up a set of args for the subprocess call.
    cmd_args = [
        "triton",
        "import",
        "--model",
        args.model,
        "--model-repository",
        MODEL_DIRECTORY,
    ]

    if args.engine == "vllm":
        cmd_args += ["--backend", "vllm"]

    else:
        cmd_args += ["--backend", "tensorrtllm"]

    if args.dt is not None and args.dt in ["bfloat", "float16", "float32"]:
        cmd_args += ["--data-type", args.dt]

    if args.pp > 1:
        cmd_args += ["--pipeline-parallelism", f"{args.pp}"]

    if args.tp > 1:
        cmd_args += ["--tensor-parallelism", f"{args.tp}"]

    # When verbose, insert the verbose flag.
    # It is important to note that the flag must immediately follow `triton` and cannot be in another ordering position.
    # This limitation will likely be removed a future release of triton_cli.
    if is_verbose:
        cmd_args.insert(1, "--verbose")

    result = run_command(cmd_args)
    exit(result)


# ---


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", type=str, choices=["exec", "init"])
@@ -302,31 +242,17 @@ def parse_arguments():
HUGGING_FACE_HOME = os.getenv(HUGGING_FACE_KEY)

is_verbose = os.getenv(CLI_VERBOSE_KEY) is not None

# Validate that `ENGINE_DIRECTORY` isn't empty.
if ENGINE_DIRECTORY is None or len(ENGINE_DIRECTORY) == 0:
    raise Exception(f"Required environment variable '{ENGINE_PATH_KEY}' not set.")

# Validate that `ENGINE_DIRECTORY` actually exists.
if not os.path.exists(ENGINE_DIRECTORY):
    raise Exception(f"Engine directory '{ENGINE_DIRECTORY}' does not exist.")

# Validate that `MODEL_DIRECTORY` actually exists.
if not os.path.exists(MODEL_DIRECTORY):
    raise Exception(f"Model directory '{MODEL_DIRECTORY}' does not exist.")

# Parse options provided.
args = parse_arguments()

# Update the is_verbose flag with values passed in by options.
is_verbose = is_verbose or args.verbose > 0

if args.mode == "init":
    initialize_model(args)
    print("Hello, World!")
    exit(EXIT_SUCCESS)

elif args.mode == "exec":
    # Update the is_verbose flag with values passed in by options.
    is_verbose = is_verbose or args.verbose > 0
    execute_triton(args)

else:
    write_error(f"usage: server.py <mode> [<options>].")
    write_error(f'  Invalid mode ("{args.mode}") provided.')
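
The server.py changes remove the Triton CLI driven initialization path entirely: MODEL_DIRECTORY now points at /var/run/models/tensorrtllm_backend/triton_model_repo on the shared volume, which is expected to be populated ahead of time, and "init" mode is reduced to a stub that prints a placeholder and exits. Purely as a sketch of the two remaining modes, assuming no additional flags are required (other flag names are not visible in this diff):

# "init" no longer builds engines or imports models; it exits immediately.
python3 server.py init

# "exec" still launches Triton against the pre-built model repository.
python3 server.py exec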
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
ARG ENGINE_DEST_PATH=/var/run/engines
ARG HF_HOME=/var/run/cache

@@ -36,14 +36,6 @@ ENV HF_HOME ${HF_HOME}
# Set the active working directory.
WORKDIR /workspace

# Install a custom version of Triton CLI that support Tensor parallelism and
# the 70B version of Llama models.
RUN pip --verbose install \
--no-cache-dir \
--no-color \
--no-input \
git+https://github.com/triton-inference-server/triton_cli.git@jwyman/aslb-mn

# Copy the server script.
COPY server.py .

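
With the base image bumped to 24.08 and the custom triton_cli install removed, the container build now only copies server.py on top of the Triton TRT-LLM image. Building it might look like the following, with the tag chosen purely for illustration:

# Hypothetical tag; push to whichever registry the Helm chart pulls from.
docker build -t triton-autoscaling:24.08 .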
@@ -0,0 +1,29 @@
apiVersion: v1
kind: Pod
metadata:
  name: setup-ssh-nfs
  labels:
    app: setup-ssh-nfs
spec:
  containers:
    - name: triton
      image: nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
      command: ["sleep", "infinity"]
      resources:
        limits:
          nvidia.com/gpu: 4
        requests:
          nvidia.com/gpu: 4
      volumeMounts:
        - mountPath: /var/run/models
          name: model-repository
        - mountPath: /dev/shm
          name: dshm
  volumes:
    - name: model-repository
      persistentVolumeClaim:
        claimName: nfs-claim-autoscaling-2
    - name: dshm
      emptyDir:
        medium: Memory
        sizeLimit: 512Gi
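
This setup-ssh-nfs pod mounts the same nfs-claim-autoscaling-2 claim at /var/run/models, so it can be used to populate the shared model repository (for example, laying out tensorrtllm_backend/triton_model_repo and the built engines) before the autoscaled deployment starts. A minimal sketch of that workflow, assuming the manifest above is saved as setup_ssh_nfs.yaml (a hypothetical file name):

kubectl apply -f setup_ssh_nfs.yaml
kubectl wait --for=condition=Ready pod/setup-ssh-nfs

# Work interactively inside the pod to prepare
# /var/run/models/tensorrtllm_backend/triton_model_repo on the shared volume.
kubectl exec -it setup-ssh-nfs -- bash

# Remove the pod once the repository is populated; the PVC retains the data.
kubectl delete pod setup-ssh-nfs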
