From 5ba440def46425d392dedbb9fa988d1671e82f62 Mon Sep 17 00:00:00 2001 From: blaise Date: Sat, 20 Jan 2024 02:18:38 +0000 Subject: [PATCH] addressing pr feedback --- Dockerfile | 1 + app/core/edge_inference.py | 6 +- app/core/file_paths.py | 4 + deploy/bin/cluster_setup.sh | 86 ++++++++++++++++--- deploy/bin/setup_db.sh | 29 +++---- .../k3s/edge_deployment/edge_deployment.yaml | 29 ++++++- ...volume.yaml => efs_persistent_volume.yaml} | 0 deploy/k3s/local_persistent_volume.yaml | 25 ++++++ deploy/k3s/service_account.yaml | 8 +- test-edge-endpoint.py | 10 +++ 10 files changed, 158 insertions(+), 40 deletions(-) rename deploy/k3s/{persistentvolume.yaml => efs_persistent_volume.yaml} (100%) create mode 100644 deploy/k3s/local_persistent_volume.yaml create mode 100644 test-edge-endpoint.py diff --git a/Dockerfile b/Dockerfile index bdff0261..5ba70e46 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ RUN apt-get update \ nginx \ libglib2.0-0 \ libgl1-mesa-glx \ + sqlite3 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ && curl -sSL https://install.python-poetry.org | python - diff --git a/app/core/edge_inference.py b/app/core/edge_inference.py index ec40540a..ad743d14 100644 --- a/app/core/edge_inference.py +++ b/app/core/edge_inference.py @@ -12,6 +12,7 @@ from jinja2 import Template from app.core.utils import prefixed_ksuid +from app.core.file_paths import MODEL_REPOSITORY_PATH from .configs import LocalInferenceConfig @@ -22,7 +23,6 @@ class EdgeInferenceManager: INPUT_IMAGE_NAME = "image" MODEL_OUTPUTS = ["score", "confidence", "probability", "label"] INFERENCE_SERVER_URL = "inference-service:8000" - MODEL_REPOSITORY = "/mnt/models" def __init__(self, config: Dict[str, LocalInferenceConfig] | None, verbose: bool = False) -> None: """ @@ -164,7 +164,7 @@ def update_model(self, detector_id: str) -> bool: if cloud_binary_ksuid is None: logger.warning(f"No model binary ksuid returned for {detector_id}") - model_dir = os.path.join(self.MODEL_REPOSITORY, 
detector_id) + model_dir = os.path.join(MODEL_REPOSITORY_PATH, detector_id) edge_binary_ksuid = get_current_model_ksuid(model_dir) if edge_binary_ksuid and cloud_binary_ksuid is not None and cloud_binary_ksuid <= edge_binary_ksuid: logger.info(f"No new model available for {detector_id}") @@ -180,7 +180,7 @@ def update_model(self, detector_id: str) -> bool: model_buffer, pipeline_config, binary_ksuid=cloud_binary_ksuid, - repository_root=self.MODEL_REPOSITORY, + repository_root=MODEL_REPOSITORY_PATH, ) return True diff --git a/app/core/file_paths.py b/app/core/file_paths.py index a91215b3..caad5f1d 100644 --- a/app/core/file_paths.py +++ b/app/core/file_paths.py @@ -1,12 +1,16 @@ DEFAULT_EDGE_CONFIG_PATH = "/etc/groundlight/edge-config/edge-config.yaml" INFERENCE_DEPLOYMENT_TEMPLATE_PATH = "/etc/groundlight/inference-deployment/inference_deployment_template.yaml" +# A file with the namespace to be operating in KUBERNETES_NAMESPACE_PATH = "/etc/groundlight/kubernetes-namespace/namespace" # Path to the database file. # This must also match the path used in the PersistentVolumeClaim definition for the database. DATABASE_FILEPATH = "/opt/groundlight/edge/sqlite/sqlite.db" +# Path to the model repository. +MODEL_REPOSITORY_PATH = "/opt/groundlight/edge/serving/model-repo" + # Path to the database log file. This will contain all SQL queries executed by the ORM. DATABASE_ORM_LOG_FILE = "sqlalchemy.log" DATABASE_ORM_LOG_FILE_SIZE = 10_000_000 # 10 MB diff --git a/deploy/bin/cluster_setup.sh b/deploy/bin/cluster_setup.sh index ded82ceb..43a85372 100755 --- a/deploy/bin/cluster_setup.sh +++ b/deploy/bin/cluster_setup.sh @@ -8,13 +8,16 @@ # - detectors in the `inference_deployments` table # - image queries in the `image_queries_edge` table # For more on these tables you can examine the database file at -# /opt/groundlight/edge/sqlite/sqlite.db on the attached EFS volume. +# /opt/groundlight/edge/sqlite/sqlite.db on the attached volume (EFS/local). 
# Possible env vars: # - KUBECTL_CMD: path to kubectl command. Defaults to "kubectl" but can be set to "k3s kubectl" if using k3s # - INFERENCE_FLAVOR: "CPU" or "GPU". Defaults to "GPU" # - EDGE_CONFIG: contents of edge-config.yaml. If not set, will use configs/edge-config.yaml -# - EFS_VOLUME_ID: ID of the EFS volume to use if the PV and PVC don't exist yet. +# - DEPLOY_LOCAL_VERSION: Indicates whether we are building the local version of the edge endpoint. +# If set to 0, we will attach an EFS instead of a local volume. Defaults to 1. +# - EFS_VOLUME_ID: ID of the EFS volume to use if we are using the EFS version. + # move to the root directory of the repo @@ -27,19 +30,39 @@ fail() { exit 1 } +# Function to check for conflicting PV. +# This is a robustness measure to guard against errors when a user tries to create a +# persistent volume with hostPath when we already have an EFS volume mounted or vice versa. +check_pv_conflict() { + local pv_name=$1 + local expected_storage_class=$2 + + # Get existing PV details if it exists + pv_detail=$(kubectl get pv "$pv_name" -o json 2>/dev/null) + if [[ -z "$pv_detail" ]]; then + # PV does not exist, no conflict + return 0 + fi + + # Extract storage class and host path from existing PV + existing_storage_class=$(echo "$pv_detail" | jq -r '.spec.storageClassName') + + # Compare existing PV details with expected details + if [[ "$existing_storage_class" != "$expected_storage_class" ]]; then + echo "Alert: Existing PersistentVolume '$pv_name' conflicts with the anticipated resource." + echo "Existing storage class: $existing_storage_class, Expected: $expected_storage_class" + echo "Consider deleting the existing PV/PVC and try again." + return 1 + fi + return 0 +} + K=${KUBECTL_CMD:-kubectl} INFERENCE_FLAVOR=${INFERENCE_FLAVOR:-"GPU"} -DB_RESTART=$1 +DB_RESET=$1 +DEPLOY_LOCAL_VERSION=${DEPLOY_LOCAL_VERSION:-1} -# Ensure database file has been correctly setup. 
If the first argument is "db_reset", -# all the data in the database will be deleted first. -# For now, this means all -# - detectors in the `inference_deployments` table -# - image queries in the `image_queries_edge` table -# For more on these tables you can examine the database file at -# /opt/groundlight/edge/sqlite/sqlite.db -./deploy/bin/setup_db.sh $DB_RESTART # Secrets ./deploy/bin/make-aws-secret.sh @@ -54,6 +77,8 @@ fi $K delete configmap --ignore-not-found edge-config $K delete configmap --ignore-not-found inference-deployment-template $K delete configmap --ignore-not-found kubernetes-namespace +$K delete configmap --ignore-not-found setup-db +$K delete configmap --ignore-not-found db-reset if [[ -n "${EDGE_CONFIG}" ]]; then echo "Creating config from EDGE_CONFIG env var" @@ -81,6 +106,15 @@ fi DEPLOYMENT_NAMESPACE=$($K config view -o json | jq -r '.contexts[] | select(.name == "'$(kubectl config current-context)'") | .context.namespace') $K create configmap kubernetes-namespace --from-literal=namespace=${DEPLOYMENT_NAMESPACE} +$K create configmap setup-db --from-file=$(pwd)/deploy/bin/setup_db.sh -n ${DEPLOYMENT_NAMESPACE} + +# If db_reset is passed as an argument, create a environment variable DB_RESET +if [[ "$DB_RESET" == "db_reset" ]]; then + $K create configmap db-reset --from-literal=DB_RESET=1 +else + $K create configmap db-reset --from-literal=DB_RESET=0 +fi + # Clean up existing deployments and services (if they exist) $K delete --ignore-not-found deployment edge-endpoint $K delete --ignore-not-found service edge-endpoint-service @@ -93,6 +127,30 @@ $K get service -o custom-columns=":metadata.name" --no-headers=true | \ # Reapply changes +# Check if DEPLOY_LOCAL_VERSION is set. If so, use a local volume instead of an EFS volume +if [[ "${DEPLOY_LOCAL_VERSION}" == "1" ]]; then + if ! check_pv_conflict "edge-endpoint-pv" "local-sc"; then + fail "PersistentVolume edge-endpoint-pv conflicts with the existing resource." 
+ fi + + $K apply -f deploy/k3s/local_persistent_volume.yaml +else + # If environment variable EFS_VOLUME_ID is not set, exit + if [[ -z "${EFS_VOLUME_ID}" ]]; then + fail "EFS_VOLUME_ID environment variable not set" + fi + + if ! check_pv_conflict "edge-endpoint-pv" "efs-sc"; then + fail "PersistentVolume edge-endpoint-pv conflicts with the existing resource." + fi + + # Use envsubst to replace the EFS_VOLUME_ID in the persistentvolumeclaim.yaml template + envsubst < deploy/k3s/efs_persistent_volume.yaml > deploy/k3s/persistentvolume.yaml + $K apply -f deploy/k3s/persistentvolume.yaml + rm deploy/k3s/persistentvolume.yaml +fi + + # Check if the edge-endpoint-pvc exists. If not, create it if ! $K get pvc edge-endpoint-pvc; then # If environment variable EFS_VOLUME_ID is not set, exit @@ -105,7 +163,11 @@ if ! $K get pvc edge-endpoint-pvc; then rm deploy/k3s/persistentvolume.yaml.tmp fi -$K apply -f deploy/k3s/service_account.yaml +# Substitutes the namespace in the service_account.yaml template +envsubst < deploy/k3s/service_account.yaml > deploy/k3s/service_account.yaml.tmp +$K apply -f deploy/k3s/service_account.yaml.tmp +rm deploy/k3s/service_account.yaml.tmp + $K apply -f deploy/k3s/edge_deployment/edge_deployment.yaml $K describe deployment edge-endpoint \ No newline at end of file diff --git a/deploy/bin/setup_db.sh b/deploy/bin/setup_db.sh index d9102869..e84ebe5b 100755 --- a/deploy/bin/setup_db.sh +++ b/deploy/bin/setup_db.sh @@ -1,9 +1,14 @@ #!/bin/bash -set -ex +# Sets up the SQLite database for the edge endpoint. +# Expects the following environment variables: +# - DB_RESET: If set to 1, will delete all the data in the database. +# +# This script is invoked by an initContainer in the edge endpoint deployment. -cd "$(dirname "$0")" +set -ex +DB_RESET=${DB_RESET:-0} DATABASE_DIRECTORY="/opt/groundlight/edge/sqlite" DATABASE_PATH="${DATABASE_DIRECTORY}/sqlite.db" @@ -24,17 +29,7 @@ reset_tables() { done } - -# Check if we have sqlite3 CLI installed. 
If not, install it -if [ ! -x "/usr/bin/sqlite3" ]; then - echo "sqlite3 could not be found. Installing it now..." - sudo apt-get update - sudo apt install -y sqlite3 - -else - echo "sqlite3 is already installed." - which sqlite3 -fi +echo "Using sqlite3 from $(which sqlite3)" # If the database already exists, exit. Otherwise, create it @@ -51,15 +46,13 @@ else echo "${ENTRY_QUERY}" | sqlite3 "${DATABASE_PATH}" echo "${DROP_QUERY}" | sqlite3 "${DATABASE_PATH}" - # Set journal model to Write-Ahead Logging. This makes it much faster, at the risk of + # Set journal mode to Write-Ahead Logging. This makes it much faster, at the risk of # possibly losing data if the machine crashes suddenly. # https://www.sqlite.org/wal.html echo "PRAGMA journal_mode=WAL;" | sqlite3 "${DATABASE_PATH}" fi - -# Reset tables if the first argument is "db_reset" -if [ "$1" == "db_reset" ]; then - echo "Resetting database tables..." +if [[ "${DB_RESET}" == "1" ]]; then + echo "Resetting tables..." reset_tables fi \ No newline at end of file diff --git a/deploy/k3s/edge_deployment/edge_deployment.yaml b/deploy/k3s/edge_deployment/edge_deployment.yaml index 85dcb57f..82db3c30 100644 --- a/deploy/k3s/edge_deployment/edge_deployment.yaml +++ b/deploy/k3s/edge_deployment/edge_deployment.yaml @@ -38,9 +38,28 @@ spec: # This service account is used by the edge logic to access the Kubernetes API # from within the pod. See deploy/k3s/service_account.yaml for more details serviceAccountName: edge-endpoint-service-account + initContainers: + - name: database-prep + image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:miscellaneous-c290a1810-dirty-8c48a5f159e9542 + imagePullPolicy: IfNotPresent + env: + # Flag to indicate whether or not to reset all database tables. Resetting WILL delete + # all existing data in the database, so set this flag to 1 with caution. 
+        - name: DB_RESET
+          valueFrom:
+            configMapKeyRef:
+              name: db-reset
+              key: DB_RESET
+        volumeMounts:
+        - name: edge-endpoint-persistent-volume
+          mountPath: /opt/groundlight/edge/sqlite
+        - name: sqlite-db-setup-script-volume
+          mountPath: /scripts
+        command: ["/bin/bash", "/scripts/setup_db.sh"]
+
       containers:
       - name: edge-endpoint
-        image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:3e7357529-miscellaneous
+        image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:miscellaneous-c290a1810-dirty-8c48a5f159e9542
         imagePullPolicy: IfNotPresent
         ports:
         - containerPort: 6717
@@ -64,7 +83,7 @@ spec:
           mountPath: /opt/groundlight/edge/sqlite
 
       - name: inference-model-updater
-        image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:3e7357529-miscellaneous
+        image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:miscellaneous-c290a1810-dirty-8c48a5f159e9542
         imagePullPolicy: IfNotPresent
         command: ["/bin/bash", "-c"]
         args: ["poetry run python -m app.model_updater.update_models"]
@@ -91,10 +110,11 @@ spec:
           mountPath: /opt/groundlight/edge/sqlite
 
         - name: edge-endpoint-persistent-volume
-          mountPath: /var/groundlight/serving/model_repository
+          mountPath: /opt/groundlight/edge/serving/model-repo
 
       imagePullSecrets:
       - name: registry-credentials
+
       volumes:
       - name: edge-config-volume
         configMap:
@@ -105,6 +125,9 @@ spec:
       - name: inference-deployment-template-volume
         configMap:
           name: inference-deployment-template
+      - name: sqlite-db-setup-script-volume
+        configMap:
+          name: setup-db
       - name: edge-endpoint-persistent-volume
         persistentVolumeClaim:
           # This PVC is attached to an EFS volume.
diff --git a/deploy/k3s/persistentvolume.yaml b/deploy/k3s/efs_persistent_volume.yaml similarity index 100% rename from deploy/k3s/persistentvolume.yaml rename to deploy/k3s/efs_persistent_volume.yaml diff --git a/deploy/k3s/local_persistent_volume.yaml b/deploy/k3s/local_persistent_volume.yaml new file mode 100644 index 00000000..d859191e --- /dev/null +++ b/deploy/k3s/local_persistent_volume.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: edge-endpoint-pv +spec: + capacity: + storage: 2Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: local-sc + hostPath: + path: /opt/groundlight/edge +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: edge-endpoint-pvc +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + storageClassName: local-sc \ No newline at end of file diff --git a/deploy/k3s/service_account.yaml b/deploy/k3s/service_account.yaml index e3cf8e2f..85b89044 100644 --- a/deploy/k3s/service_account.yaml +++ b/deploy/k3s/service_account.yaml @@ -16,12 +16,12 @@ apiVersion: v1 kind: ServiceAccount metadata: name: edge-endpoint-service-account - namespace: agot-edge-dev1 + namespace: ${DEPLOYMENT_NAMESPACE} --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - namespace: agot-edge-dev1 + namespace: ${DEPLOYMENT_NAMESPACE} name: limited-access-role rules: # We only need to allow access to the resources that the edge logic needs @@ -39,11 +39,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: edge-endpoint-role-binding - namespace: agot-edge-dev1 + namespace: ${DEPLOYMENT_NAMESPACE} subjects: - kind: ServiceAccount name: edge-endpoint-service-account - namespace: agot-edge-dev1 + namespace: ${DEPLOYMENT_NAMESPACE} roleRef: kind: Role name: limited-access-role diff --git a/test-edge-endpoint.py b/test-edge-endpoint.py new file mode 100644 index 00000000..efd4471d --- /dev/null +++ 
b/test-edge-endpoint.py
@@ -0,0 +1,10 @@
+from groundlight import Groundlight
+
+def main():
+    # For the edge, we don't actually need an API token here. Just the endpoint
+    sdk_client = Groundlight(endpoint="http://")
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file