Skip to content

Commit

Permalink
addressing pr feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
blaise-muhirwa committed Jan 20, 2024
1 parent c290a18 commit 5ba440d
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 40 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ RUN apt-get update \
nginx \
libglib2.0-0 \
libgl1-mesa-glx \
sqlite3 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& curl -sSL https://install.python-poetry.org | python -
Expand Down
6 changes: 3 additions & 3 deletions app/core/edge_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from jinja2 import Template

from app.core.utils import prefixed_ksuid
from app.core.file_paths import MODEL_REPOSITORY_PATH

from .configs import LocalInferenceConfig

Expand All @@ -22,7 +23,6 @@ class EdgeInferenceManager:
INPUT_IMAGE_NAME = "image"
MODEL_OUTPUTS = ["score", "confidence", "probability", "label"]
INFERENCE_SERVER_URL = "inference-service:8000"
MODEL_REPOSITORY = "/mnt/models"

def __init__(self, config: Dict[str, LocalInferenceConfig] | None, verbose: bool = False) -> None:
"""
Expand Down Expand Up @@ -164,7 +164,7 @@ def update_model(self, detector_id: str) -> bool:
if cloud_binary_ksuid is None:
logger.warning(f"No model binary ksuid returned for {detector_id}")

model_dir = os.path.join(self.MODEL_REPOSITORY, detector_id)
model_dir = os.path.join(MODEL_REPOSITORY_PATH, detector_id)
edge_binary_ksuid = get_current_model_ksuid(model_dir)
if edge_binary_ksuid and cloud_binary_ksuid is not None and cloud_binary_ksuid <= edge_binary_ksuid:
logger.info(f"No new model available for {detector_id}")
Expand All @@ -180,7 +180,7 @@ def update_model(self, detector_id: str) -> bool:
model_buffer,
pipeline_config,
binary_ksuid=cloud_binary_ksuid,
repository_root=self.MODEL_REPOSITORY,
repository_root=MODEL_REPOSITORY_PATH,
)
return True

Expand Down
4 changes: 4 additions & 0 deletions app/core/file_paths.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# Path to the edge-endpoint configuration file (mounted from the edge-config configmap).
DEFAULT_EDGE_CONFIG_PATH = "/etc/groundlight/edge-config/edge-config.yaml"
# Template used to create per-detector inference deployments
# (mounted from the inference-deployment-template configmap).
INFERENCE_DEPLOYMENT_TEMPLATE_PATH = "/etc/groundlight/inference-deployment/inference_deployment_template.yaml"

# A file with the namespace to be operating in
KUBERNETES_NAMESPACE_PATH = "/etc/groundlight/kubernetes-namespace/namespace"

# Path to the database file.
# This must also match the path used in the PersistentVolumeClaim definition for the database.
DATABASE_FILEPATH = "/opt/groundlight/edge/sqlite/sqlite.db"

# Path to the model repository.
# This must also match the model-repo mount path used in the edge deployment manifest.
MODEL_REPOSITORY_PATH = "/opt/groundlight/edge/serving/model-repo"

# Path to the database log file. This will contain all SQL queries executed by the ORM.
DATABASE_ORM_LOG_FILE = "sqlalchemy.log"
DATABASE_ORM_LOG_FILE_SIZE = 10_000_000 # 10 MB
86 changes: 74 additions & 12 deletions deploy/bin/cluster_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@
# - detectors in the `inference_deployments` table
# - image queries in the `image_queries_edge` table
# For more on these tables you can examine the database file at
# /opt/groundlight/edge/sqlite/sqlite.db on the attached EFS volume.
# /opt/groundlight/edge/sqlite/sqlite.db on the attached volume (EFS/local).

# Possible env vars:
# - KUBECTL_CMD: path to kubectl command. Defaults to "kubectl" but can be set to "k3s kubectl" if using k3s
# - INFERENCE_FLAVOR: "CPU" or "GPU". Defaults to "GPU"
# - EDGE_CONFIG: contents of edge-config.yaml. If not set, will use configs/edge-config.yaml
# - EFS_VOLUME_ID: ID of the EFS volume to use if the PV and PVC don't exist yet.
# - DEPLOY_LOCAL_VERSION: Indicates whether we are building the local version of the edge endpoint.
# If set to 0, we will attach an EFS instead of a local volume. Defaults to 1.
# - EFS_VOLUME_ID: ID of the EFS volume to use if we are using the EFS version.



# move to the root directory of the repo
Expand All @@ -27,19 +30,39 @@ fail() {
exit 1
}

# Guard against conflicting PersistentVolume definitions.
# If a PV named $1 already exists, verify that its storageClassName matches the
# one we are about to deploy ($2). This is a robustness measure to guard against
# errors when a user tries to create a persistent volume with hostPath when we
# already have an EFS volume mounted, or vice versa.
#
# Returns 0 when there is no conflict (PV absent or storage class matches),
# 1 when the existing PV uses a different storage class.
check_pv_conflict() {
    local pv_name=$1
    local expected_storage_class=$2
    local pv_detail existing_storage_class

    # Get existing PV details if it exists. Use ${K:-kubectl} so this honors
    # the KUBECTL_CMD override (e.g. "k3s kubectl"), consistent with the rest
    # of this script.
    pv_detail=$(${K:-kubectl} get pv "$pv_name" -o json 2>/dev/null)
    if [[ -z "$pv_detail" ]]; then
        # PV does not exist, no conflict
        return 0
    fi

    # Extract the storage class from the existing PV
    existing_storage_class=$(echo "$pv_detail" | jq -r '.spec.storageClassName')

    # Compare existing PV details with expected details
    if [[ "$existing_storage_class" != "$expected_storage_class" ]]; then
        echo "Alert: Existing PersistentVolume '$pv_name' conflicts with the anticipated resource."
        echo "Existing storage class: $existing_storage_class, Expected: $expected_storage_class"
        echo "Consider deleting the existing PV/PVC and try again."
        return 1
    fi
    return 0
}


K=${KUBECTL_CMD:-kubectl}
INFERENCE_FLAVOR=${INFERENCE_FLAVOR:-"GPU"}
DB_RESTART=$1
DB_RESET=$1
DEPLOY_LOCAL_VERSION=${DEPLOY_LOCAL_VERSION:-1}

# Ensure database file has been correctly setup. If the first argument is "db_reset",
# all the data in the database will be deleted first.
# For now, this means all
# - detectors in the `inference_deployments` table
# - image queries in the `image_queries_edge` table
# For more on these tables you can examine the database file at
# /opt/groundlight/edge/sqlite/sqlite.db
./deploy/bin/setup_db.sh $DB_RESTART

# Secrets
./deploy/bin/make-aws-secret.sh
Expand All @@ -54,6 +77,8 @@ fi
$K delete configmap --ignore-not-found edge-config
$K delete configmap --ignore-not-found inference-deployment-template
$K delete configmap --ignore-not-found kubernetes-namespace
$K delete configmap --ignore-not-found setup-db
$K delete configmap --ignore-not-found db-reset

if [[ -n "${EDGE_CONFIG}" ]]; then
echo "Creating config from EDGE_CONFIG env var"
Expand Down Expand Up @@ -81,6 +106,15 @@ fi
DEPLOYMENT_NAMESPACE=$($K config view -o json | jq -r '.contexts[] | select(.name == "'$(kubectl config current-context)'") | .context.namespace')
$K create configmap kubernetes-namespace --from-literal=namespace=${DEPLOYMENT_NAMESPACE}

$K create configmap setup-db --from-file=$(pwd)/deploy/bin/setup_db.sh -n ${DEPLOYMENT_NAMESPACE}

# If db_reset is passed as an argument, store DB_RESET=1 in the db-reset configmap
if [[ "$DB_RESET" == "db_reset" ]]; then
$K create configmap db-reset --from-literal=DB_RESET=1
else
$K create configmap db-reset --from-literal=DB_RESET=0
fi

# Clean up existing deployments and services (if they exist)
$K delete --ignore-not-found deployment edge-endpoint
$K delete --ignore-not-found service edge-endpoint-service
Expand All @@ -93,6 +127,30 @@ $K get service -o custom-columns=":metadata.name" --no-headers=true | \

# Reapply changes

# Check if DEPLOY_LOCAL_VERSION is set. If so, use a local volume instead of an EFS volume
if [[ "${DEPLOY_LOCAL_VERSION}" == "1" ]]; then
if ! check_pv_conflict "edge-endpoint-pv" "local-sc"; then
fail "PersistentVolume edge-endpoint-pv conflicts with the existing resource."
fi

$K apply -f deploy/k3s/local_persistent_volume.yaml
else
# If environment variable EFS_VOLUME_ID is not set, exit
if [[ -z "${EFS_VOLUME_ID}" ]]; then
fail "EFS_VOLUME_ID environment variable not set"
fi

if ! check_pv_conflict "edge-endpoint-pv" "efs-sc"; then
fail "PersistentVolume edge-endpoint-pv conflicts with the existing resource."
fi

# Use envsubst to replace the EFS_VOLUME_ID in the persistentvolumeclaim.yaml template
envsubst < deploy/k3s/efs_persistent_volume.yaml > deploy/k3s/persistentvolume.yaml
$K apply -f deploy/k3s/persistentvolume.yaml
rm deploy/k3s/persistentvolume.yaml
fi


# Check if the edge-endpoint-pvc exists. If not, create it
if ! $K get pvc edge-endpoint-pvc; then
# If environment variable EFS_VOLUME_ID is not set, exit
Expand All @@ -105,7 +163,11 @@ if ! $K get pvc edge-endpoint-pvc; then
rm deploy/k3s/persistentvolume.yaml.tmp
fi

$K apply -f deploy/k3s/service_account.yaml
# Substitutes the namespace in the service_account.yaml template
envsubst < deploy/k3s/service_account.yaml > deploy/k3s/service_account.yaml.tmp
$K apply -f deploy/k3s/service_account.yaml.tmp
rm deploy/k3s/service_account.yaml.tmp

$K apply -f deploy/k3s/edge_deployment/edge_deployment.yaml

$K describe deployment edge-endpoint
29 changes: 11 additions & 18 deletions deploy/bin/setup_db.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
#!/bin/bash

set -ex
# Sets up the SQLite database for the edge endpoint.
# Expects the following environment variables:
# - DB_RESET: If set to 1, will delete all the data in the database.
#
# This script is invoked by an initContainer in the edge endpoint deployment.

cd "$(dirname "$0")"
set -ex

DB_RESET=${DB_RESET:-0}

DATABASE_DIRECTORY="/opt/groundlight/edge/sqlite"
DATABASE_PATH="${DATABASE_DIRECTORY}/sqlite.db"
Expand All @@ -24,17 +29,7 @@ reset_tables() {
done
}


# Check if we have sqlite3 CLI installed. If not, install it
if [ ! -x "/usr/bin/sqlite3" ]; then
echo "sqlite3 could not be found. Installing it now..."
sudo apt-get update
sudo apt install -y sqlite3

else
echo "sqlite3 is already installed."
which sqlite3
fi
echo "Using sqlite3 from $(which sqlite3)"


# If the database already exists, exit. Otherwise, create it
Expand All @@ -51,15 +46,13 @@ else
echo "${ENTRY_QUERY}" | sqlite3 "${DATABASE_PATH}"
echo "${DROP_QUERY}" | sqlite3 "${DATABASE_PATH}"

# Set journal model to Write-Ahead Logging. This makes it much faster, at the risk of
# Set journal mode to Write-Ahead Logging. This makes it much faster, at the risk of
# possibly losing data if the machine crashes suddenly.
# https://www.sqlite.org/wal.html
echo "PRAGMA journal_mode=WAL;" | sqlite3 "${DATABASE_PATH}"
fi


# Reset tables if the first argument is "db_reset"
if [ "$1" == "db_reset" ]; then
echo "Resetting database tables..."
if [[ "${DB_RESET}" == "1" ]]; then
echo "Resetting tables..."
reset_tables
fi
29 changes: 26 additions & 3 deletions deploy/k3s/edge_deployment/edge_deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,28 @@ spec:
# This service account is used by the edge logic to access the Kubernetes API
# from within the pod. See deploy/k3s/service_account.yaml for more details
serviceAccountName: edge-endpoint-service-account
initContainers:
- name: database-prep
image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:miscellaneous-c290a1810-dirty-8c48a5f159e9542
imagePullPolicy: IfNotPresent
env:
# Flag indicating whether to reset all database tables. Resetting WILL delete
# all existing data in the database, so set this flag to 1 with caution.
# NOTE(review): the setup-db configmap is created with --from-file=setup_db.sh
# (underscore), so the mounted file is /scripts/setup_db.sh, but the command
# below runs /scripts/setup-db.sh (hyphen) -- verify the key name, or the
# initContainer will fail to find the script.
- name: DB_RESET
valueFrom:
configMapKeyRef:
name: db-reset
key: DB_RESET
volumeMounts:
- name: edge-endpoint-persistent-volume
mountPath: /opt/groundlight/edge/sqlite
- name: sqlite-db-setup-script-volume
mountPath: /scripts
command: ["/bin/bash", "/scripts/setup-db.sh"]

containers:
- name: edge-endpoint
image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:3e7357529-miscellaneous
image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:miscellaneous-c290a1810-dirty-8c48a5f159e9542
imagePullPolicy: IfNotPresent
ports:
- containerPort: 6717
Expand All @@ -64,7 +83,7 @@ spec:
mountPath: /opt/groundlight/edge/sqlite

- name: inference-model-updater
image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:3e7357529-miscellaneous
image: 723181461334.dkr.ecr.us-west-2.amazonaws.com/edge-endpoint:miscellaneous-c290a1810-dirty-8c48a5f159e9542
imagePullPolicy: IfNotPresent
command: ["/bin/bash", "-c"]
args: ["poetry run python -m app.model_updater.update_models"]
Expand All @@ -91,10 +110,11 @@ spec:
mountPath: /opt/groundlight/edge/sqlite

- name: edge-endpoint-persistent-volume
mountPath: /var/groundlight/serving/model_repository
mountPath: /opt/groundlight/edge/serving/model-repo

imagePullSecrets:
- name: registry-credentials

volumes:
- name: edge-config-volume
configMap:
Expand All @@ -105,6 +125,9 @@ spec:
- name: inference-deployment-template-volume
configMap:
name: inference-deployment-template
- name: sqlite-db-setup-script-volume
configMap:
name: setup-db
- name: edge-endpoint-persistent-volume
persistentVolumeClaim:
# This PVC is attached to an EFS volume.
Expand Down
File renamed without changes.
25 changes: 25 additions & 0 deletions deploy/k3s/local_persistent_volume.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Local (hostPath) storage for the edge endpoint. Used by cluster_setup.sh when
# DEPLOY_LOCAL_VERSION is enabled, in place of the EFS-backed volume.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: edge-endpoint-pv
spec:
  capacity:
    storage: 2Gi
  accessModes:
    - ReadWriteOnce
  # Retain keeps the host directory (sqlite db, model repo) even if the claim is deleted.
  persistentVolumeReclaimPolicy: Retain
  # Must differ from the EFS storage class ("efs-sc") -- cluster_setup.sh uses the
  # storage class name to detect PV conflicts between the two deployment flavors.
  storageClassName: local-sc
  hostPath:
    path: /opt/groundlight/edge
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: edge-endpoint-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
  storageClassName: local-sc
8 changes: 4 additions & 4 deletions deploy/k3s/service_account.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: edge-endpoint-service-account
namespace: agot-edge-dev1
namespace: ${DEPLOYMENT_NAMESPACE}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: agot-edge-dev1
namespace: ${DEPLOYMENT_NAMESPACE}
name: limited-access-role
rules:
# We only need to allow access to the resources that the edge logic needs
Expand All @@ -39,11 +39,11 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: edge-endpoint-role-binding
namespace: agot-edge-dev1
namespace: ${DEPLOYMENT_NAMESPACE}
subjects:
- kind: ServiceAccount
name: edge-endpoint-service-account
namespace: agot-edge-dev1
namespace: ${DEPLOYMENT_NAMESPACE}
roleRef:
kind: Role
name: limited-access-role
Expand Down
10 changes: 10 additions & 0 deletions test-edge-endpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from groundlight import Groundlight

def main() -> None:
    """Construct a Groundlight SDK client pointed at an edge endpoint.

    NOTE(review): the endpoint URL is an incomplete placeholder ("http://") and
    the client is created but never exercised -- presumably the host:port of a
    running edge endpoint should be filled in, and a query issued, before this
    script is useful. Confirm the intended endpoint.
    """
    # For the edge, we don't actually need an API token here. Just the endpoint
    sdk_client = Groundlight(endpoint="http://")



if __name__ == "__main__":
    main()

0 comments on commit 5ba440d

Please sign in to comment.