Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
giannibalistreri committed Dec 30, 2023
1 parent 91fbbda commit b7a9c4f
Show file tree
Hide file tree
Showing 16 changed files with 735 additions and 0 deletions.
37 changes: 37 additions & 0 deletions deployments/enterprise/terraform/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## Setup Terraform on AWS

1. Create IAM Role:

Terraform needs full access to all required AWS services. Therefore, create an IAM role that can be assumed by Terraform:
- TerraformServiceRole:
- Permission Policies: AdministratorAccess
- Trust Relationship: see json file "terraform_iam_role_trust_relationship.json"

2. Create User:

The created IAM role needs to be attached to a user who is member of a user group.
- User Group: CICD
- User: Gitlab
- Permission Policies: gitlab_permissions (see json file "gitlab_permissions.json")

3. Create TF-State S3 Bucket:

In order to make the Terraform deployment available to an organisation, the Terraform state file must be persisted in an accessible S3 bucket.
- S3 Bucket: xxx-ml-ops-tfstate-production


## Prerequisites of Kubeflow Deployment

1. Provision Domain:

In order to make Kubeflow available via the internet, a public domain is needed. AWS offers a service called Route53 which can be used to provision domains.
- Route53 > Hosted zones > Create hosted zone

2. Setup Gitlab-CI:

The following configurations must be made so that Terraform can be executed via CI/CD:
- Create Gitlab Variables: Settings > CI/CD > Variables
- AWS_ACCESS_KEY_ID (mask variable, expand variable reference)
- AWS_ACCOUNT_ID (mask variable, expand variable reference)
- AWS_SECRET_ACCESS_KEY (mask variable, expand variable reference)
- AWS_REGION (mask variable, expand variable reference)
50 changes: 50 additions & 0 deletions kubeflow_templates/kfserve/custom_predictor_pytorch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Customized model inference predictor: PyTorch
"""

import argparse
import numpy as np
import torch

from kserve import Model, ModelServer
from torchvision import models
from typing import Dict


class AlexNetModel(Model):
    """
    KServe predictor serving a pretrained torchvision AlexNet for image classification
    """

    def __init__(self, name: str):
        """
        :param name: str
            Name of the served model
        """
        super().__init__(name)
        self.name = name
        self.model = None
        self.load()
        # signal to the model server that the model is loaded and ready to serve
        self.ready = True

    def load(self):
        """
        Load pretrained AlexNet weights and switch the network to inference mode
        """
        self.model = models.alexnet(pretrained=True)
        self.model.eval()

    def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict:
        """
        Run inference on a base64 encoded image

        :param payload: Dict
            Request body; expects payload["instances"][0]["image"]["b64"] to hold a base64 encoded image

        :param headers: Dict[str, str]
            Optional request headers (unused)

        :return: Dict
            Top-5 class scores under the "predictions" key
        """
        # Fix: base64, io, Image and transforms were referenced without being imported
        import base64
        import io
        from PIL import Image
        from torchvision import transforms

        img_data = payload["instances"][0]["image"]["b64"]
        raw_img_data = base64.b64decode(img_data)
        input_image = Image.open(io.BytesIO(raw_img_data))
        # standard ImageNet preprocessing pipeline expected by AlexNet
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        input_tensor = preprocess(input_image).unsqueeze(0)
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            output = self.model(input_tensor)
        # Fix: the softmax result was previously computed but discarded;
        # rank the classes by probability instead of raw logits
        probabilities = torch.nn.functional.softmax(output, dim=1)
        values, top_5 = torch.topk(probabilities, 5)
        result = values.flatten().tolist()
        # Fix: removed call to undefined generate_uuid() (unused result anyway)
        return {"predictions": result}


if __name__ == "__main__":
    # Start the KServe model server hosting a single AlexNet predictor
    model = AlexNetModel("custom-model")
    ModelServer().start([model])
58 changes: 58 additions & 0 deletions kubeflow_templates/kfserve/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import requests
import os
import json

from e2e.utils.utils import load_json_file


def run_inference_sample():
    """
    Send a sample prediction request to a KServe inference service and print the response.

    Configuration is read from environment variables (KUBEFLOW_DOMAIN,
    PROFILE_NAMESPACE, MODEL_NAME, AUTH_PROVIDER, ...) with local defaults.

    :raises Exception: if the prediction endpoint returns a non-200 status code
    """
    # common vars
    KUBEFLOW_DOMAIN = os.environ.get("KUBEFLOW_DOMAIN", "kubeflow.example.com")
    PROFILE_NAMESPACE = os.environ.get("PROFILE_NAMESPACE", "kubeflow-user-example-com")
    MODEL_NAME = os.environ.get("MODEL_NAME", "sklearn-iris")
    AUTH_PROVIDER = os.environ.get("AUTH_PROVIDER", "dex")

    URL = f"https://{MODEL_NAME}.{PROFILE_NAMESPACE}.{KUBEFLOW_DOMAIN}/v1/models/{MODEL_NAME}:predict"
    HEADERS = {"Host": f"{MODEL_NAME}.{PROFILE_NAMESPACE}.{KUBEFLOW_DOMAIN}"}
    DASHBOARD_URL = f"https://kubeflow.{KUBEFLOW_DOMAIN}"
    data = load_json_file("./utils/kserve/iris-input.json")
    if AUTH_PROVIDER != "cognito":
        # Dex: authenticate against the dashboard login form and reuse the session cookie
        PROFILE_USERNAME = os.environ.get("PROFILE_USERNAME", "[email protected]")
        PASSWORD = os.environ.get("PASSWORD", "12341234")

        def session_cookie(host, login, password):
            """Log in to the dashboard and return the authservice session cookie value"""
            session = requests.Session()
            response = session.get(host)
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
            }
            data = {"login": login, "password": password}
            # post credentials to the URL the login page redirected to
            session.post(response.url, headers=headers, data=data)
            session_cookie = session.cookies.get_dict()["authservice_session"]
            return session_cookie

        cookie = {
            "authservice_session": session_cookie(
                DASHBOARD_URL, PROFILE_USERNAME, PASSWORD
            )
        }
        response = requests.post(URL, headers=HEADERS, json=data, cookies=cookie)
    else:
        # Cognito: authenticate via a static API-key style header
        HTTP_HEADER_NAME = os.environ.get("HTTP_HEADER_NAME", "x-api-key")
        HTTP_HEADER_VALUE = os.environ.get("HTTP_HEADER_VALUE", "token1")
        HEADERS[HTTP_HEADER_NAME] = HTTP_HEADER_VALUE

        response = requests.post(URL, headers=HEADERS, json=data)

    status_code = response.status_code
    print("Status Code", status_code)
    if status_code == 200:
        print("JSON Response ", json.dumps(response.json(), indent=2))
    else:
        # Fix: the original message never interpolated the actual status code
        raise Exception(f"prediction failed, status code = {status_code}")


if __name__ == "__main__":
run_inference_sample()
16 changes: 16 additions & 0 deletions kubeflow_templates/kfserve/inference_payload.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"instances": [
[
6.8,
2.8,
4.8,
1.4
],
[
6.0,
3.4,
4.5,
1.6
]
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
FROM python:3.9-alpine

# Source directory for the predictor service code
RUN mkdir "/src"

COPY kserve/predictor/custom_predictor_sklearn/src /src
COPY kserve/predictor/custom_predictor_sklearn/pyproject.toml /src

WORKDIR /src

# Install build tooling and project dependencies in a single layer to keep the
# image small; --no-cache-dir avoids persisting pip's download cache in the layer.
# virtualenvs.create=false makes poetry install into the system interpreter.
RUN python -m pip install --no-cache-dir --upgrade pip setuptools poetry \
    && poetry config virtualenvs.create false \
    && poetry install --no-dev

ENTRYPOINT ["python", "task.py"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Build a docker image and push it to the account's ECR registry.
export IMAGE_NAME=xxx
export IMAGE_TAG=xxx
export AWS_ACCOUNT_ID=xxx
export AWS_ACCOUNT_REGION=xxx

# Authenticate the local docker daemon against the account's ECR registry
aws ecr get-login-password --region $AWS_ACCOUNT_REGION | sudo docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_ACCOUNT_REGION.amazonaws.com
# Fix: build with the explicit tag — the original built $IMAGE_NAME:latest and
# then tried to re-tag $IMAGE_NAME:$IMAGE_TAG, which fails unless IMAGE_TAG=latest
sudo docker build -t $IMAGE_NAME:$IMAGE_TAG .
sudo docker tag $IMAGE_NAME:$IMAGE_TAG $AWS_ACCOUNT_ID.dkr.ecr.$AWS_ACCOUNT_REGION.amazonaws.com/$IMAGE_NAME:$IMAGE_TAG
sudo docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_ACCOUNT_REGION.amazonaws.com/$IMAGE_NAME:$IMAGE_TAG
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[tool.poetry]
name = "predictor_sklearn"
version = "0.1.0"
description = "Predictor of endpoint inference pipeline"
authors = ["gianni <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.9"
catboost = "^1.2.2"
pygam = "^0.9.0"
s3fs = "^2023.9.2"
scikit-learn = "^1.3.2"
xgboost = "^2.0.2"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
name: custom-model
spec:
predictor:
containers:
- name: kserve-container
image: ${DOCKER_USER}/custom-model:v1
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Customized model inference predictor: Sklearn API (Non-Neural Networks)
"""

import argparse
import numpy as np
import torch

from kserve import Model, ModelServer
from torchvision import models
from typing import Dict


class SupervisedMLPredictor(Model):
    """
    Class for generating predictions used in inference endpoints of KServe

    NOTE(review): the module docstring advertises a Sklearn predictor, but this
    implementation serves a pretrained torchvision AlexNet — confirm intent
    """
    def __init__(self, name: str):
        """
        :param name: str
            Name of the served model
        """
        super().__init__(name)
        self.name: str = name
        self.model = None
        self.load()
        # Fix: readiness flag was never set; the model server checks it
        self.ready = True

    def load(self):
        """
        Load pretrained AlexNet weights and switch the network to inference mode
        """
        self.model = models.alexnet(pretrained=True)
        self.model.eval()

    def preprocess(self, inputs: Dict, headers: Dict[str, str] = None) -> Dict:
        """
        Pre-process the request payload before prediction

        :param inputs: Dict
            Raw request payload

        :return: Dict
            Payload forwarded unchanged to predict()
        """
        # Fix: the original called image_transform(), which is undefined in this
        # module and would raise NameError; predict() already does the image
        # decoding and transformation, so pass the payload through unchanged
        return inputs

    def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict:
        """
        Run inference on a base64 encoded image

        :param payload: Dict
            Request body; expects payload["instances"][0]["image"]["b64"] to hold a base64 encoded image

        :param headers: Dict[str, str]
            Optional request headers (unused)

        :return: Dict
            Top-5 class scores under the "predictions" key
        """
        # Fix: base64, io, Image and transforms were referenced without being imported
        import base64
        import io
        from PIL import Image
        from torchvision import transforms

        img_data = payload["instances"][0]["image"]["b64"]
        raw_img_data = base64.b64decode(img_data)
        input_image = Image.open(io.BytesIO(raw_img_data))
        # standard ImageNet preprocessing pipeline expected by AlexNet
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        input_tensor = preprocess(input_image).unsqueeze(0)
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            output = self.model(input_tensor)
        # Fix: the softmax result was previously computed but discarded;
        # rank the classes by probability instead of raw logits
        probabilities = torch.nn.functional.softmax(output, dim=1)
        values, top_5 = torch.topk(probabilities, 5)
        result = values.flatten().tolist()
        # Fix: removed call to undefined generate_uuid() (unused result anyway)
        return {"predictions": result}

    def postprocess(self, inputs: Dict, headers: Dict[str, str] = None) -> Dict:
        """
        Post-process the prediction result (identity pass-through)

        :param inputs: Dict
            Prediction result

        :return: Dict
            Unchanged prediction result
        """
        return inputs


if __name__ == "__main__":
    # Fix: the original instantiated undefined "Predictor"; the class defined
    # in this module is SupervisedMLPredictor
    model = SupervisedMLPredictor("custom-model")
    ModelServer().start([model])
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""
Management of katib setup
"""

import argparse
import subprocess

from typing import List


# CLI definition: the flags mirror the KubeflowKatibManagement.__init__ parameters
PARSER = argparse.ArgumentParser(description="manage kubeflow hyperparameter tuning service called katib")
PARSER.add_argument('-aws_account_id', type=str, required=True, default=None, help='AWS account id')
PARSER.add_argument('-service_account_name', type=str, required=False, default='default-editor', help='name of the service account')
PARSER.add_argument('-aws_region', type=str, required=False, default='eu-central-1', help='AWS region code')
PARSER.add_argument('-cluster_name', type=str, required=False, default='kubeflow', help='name of the EKS cluster')
# parsed at import time: this module is intended to be run as a script
ARGS = PARSER.parse_args()


class KubeflowKatibManagementException(Exception):
    """
    Exception raised for errors occurring in KubeflowKatibManagement
    """
    pass


class KubeflowKatibManagement:
    """
    Class for handling Kubeflow katib management
    """
    def __init__(self,
                 aws_account_id: str,
                 service_account_name: str,
                 aws_region: str = 'eu-central-1',
                 cluster_name: str = 'kubeflow'
                 ):
        """
        :param aws_account_id: str
            AWS Account ID

        :param service_account_name: str
            Name of the service account to inject into the katib config

        :param aws_region: str
            AWS region

        :param cluster_name: str
            Name of the EKS cluster
        """
        self.service_account_name: str = service_account_name
        self.cluster_name: str = cluster_name
        self.aws_region: str = aws_region
        self.aws_account_id: str = aws_account_id
        # update the local kubeconfig so the kubectl calls below hit this cluster
        self._login_to_eks_cluster()

    def _adjust_katib_config(self) -> str:
        """
        Fetch the katib config map from the cluster and adjust it

        :return: str
            Adjusted katib config yaml file
        """
        # NOTE(review): "-n" expects a namespace; the cluster name is used here,
        # which only works while it coincides with the namespace — verify
        _cmd: str = f"kubectl get configMap katib-config -n {self.cluster_name} -o yaml > katib_config.yaml"
        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
        with open('katib_config.yaml', 'r') as file:
            _katib_config_yaml = file.read()
        return self._inject_service_account(katib_config_yaml=_katib_config_yaml)

    def _inject_service_account(self, katib_config_yaml: str) -> str:
        """
        Inject the configured service account name into the suggestion section
        of the katib config (pure text transformation, extracted for testability)

        :param katib_config_yaml: str
            Raw katib config yaml text

        :return: str
            Adjusted katib config yaml text
        """
        # NOTE(review): the two header lines are prepended in addition to every
        # original line, so a yaml that already starts with them is duplicated — verify
        _config_yaml: List[str] = ['apiVersion: v1', 'data:']
        _found_area: bool = False
        for line in katib_config_yaml.split('\n'):
            # track whether we are inside the "suggestion" block of the config map
            if line.find('suggestion: |-') >= 0:
                _found_area = True
            if line.find('kind: ConfigMap') >= 0:
                _found_area = False
            _config_yaml.append(line)
            if _found_area:
                if line.find('"image":') >= 0:
                    # NOTE(review): leading whitespace must match the embedded
                    # json's indentation inside the yaml block scalar — verify
                    _config_yaml.append(f' "serviceAccountName": "{self.service_account_name}"')
        return "\n".join(_config_yaml)

    def _login_to_eks_cluster(self) -> None:
        """
        Login to running EKS cluster by updating the local kubeconfig
        """
        _cmd: str = f"aws eks --region {self.aws_region} update-kubeconfig --name {self.cluster_name}"
        subprocess.run(_cmd, shell=True, capture_output=False, text=True)

    def enable_katib(self) -> None:
        """
        Enable katib by applying the adjusted config map to the cluster
        """
        _katib_config_yaml: str = self._adjust_katib_config()
        with open('new_katib_config.yaml', 'w') as file:
            file.write(_katib_config_yaml)
        subprocess.run('kubectl apply -f new_katib_config.yaml', shell=True, capture_output=False, text=True)


if __name__ == '__main__':
    # Instantiate the manager from the CLI arguments and enable katib
    _manager = KubeflowKatibManagement(
        aws_account_id=ARGS.aws_account_id,
        service_account_name=ARGS.service_account_name,
        aws_region=ARGS.aws_region,
        cluster_name=ARGS.cluster_name
    )
    _manager.enable_katib()
Loading

0 comments on commit b7a9c4f

Please sign in to comment.