diff --git a/deployments/enterprise/terraform/README.md b/deployments/enterprise/terraform/README.md
new file mode 100644
index 0000000..d287b13
--- /dev/null
+++ b/deployments/enterprise/terraform/README.md
@@ -0,0 +1,37 @@
+## Setup Terraform on AWS
+
+1. Create IAM Role:
+
+Terraform needs full access to all required AWS services. Therefore, create an IAM role that can be assumed by Terraform:
+- TerraformServiceRole:
+  - Permission Policies: AdministratorAccess
+  - Trust Relationship: see json file "terraform_iam_role_trust_relationship.json" (an illustrative example is given at the end of this README)
+
+2. Create User:
+
+The created IAM role needs to be attached to a user who is a member of a user group.
+- User Group: CICD
+- User: Gitlab
+  - Permission Policies: gitlab_permissions (see json file "gitlab_permissions.json")
+
+3. Create TF-State S3 Bucket:
+
+To make the Terraform deployment available across an organisation, the Terraform state file must be persisted in an accessible S3 bucket (an example backend configuration is given at the end of this README).
+- S3 Bucket: xxx-ml-ops-tfstate-production
+
+
+## Prerequisites of Kubeflow Deployment
+
+1. Provision Domain:
+
+To make Kubeflow available via the internet, a public domain is needed. AWS offers a service called Route53 which can be used to provision a domain.
+- Route53 > Hosted zones > Create hosted zone
+
+2. Setup Gitlab-CI:
+
+The following configurations must be made so that Terraform can be executed via CI/CD:
+- Create Gitlab Variables: Settings > CI/CD > Variables
+  - AWS_ACCESS_KEY_ID (mask variable, expand variable reference)
+  - AWS_ACCOUNT_ID (mask variable, expand variable reference)
+  - AWS_SECRET_ACCESS_KEY (mask variable, expand variable reference)
+  - AWS_REGION (mask variable, expand variable reference)
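+
+## Example Snippets
+
+The following snippets are illustrative sketches only; bucket names, account IDs and user names are placeholders that must be adapted to your environment.
+
+A minimal trust relationship for the TerraformServiceRole, assuming the Gitlab IAM user is the principal that assumes the role (the actual policy lives in "terraform_iam_role_trust_relationship.json"):
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "AWS": "arn:aws:iam::<AWS_ACCOUNT_ID>:user/Gitlab"
+      },
+      "Action": "sts:AssumeRole"
+    }
+  ]
+}
+```
+
+A minimal Terraform backend configuration that persists the state file in the TF-state bucket (key and region are placeholders):
+
+```hcl
+terraform {
+  backend "s3" {
+    bucket = "xxx-ml-ops-tfstate-production"
+    key    = "kubeflow/terraform.tfstate"
+    region = "eu-central-1"
+  }
+}
+```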
diff --git a/kubeflow_templates/kfserve/custom_predictor_pytorch.py b/kubeflow_templates/kfserve/custom_predictor_pytorch.py
new file mode 100644
index 0000000..b98c129
--- /dev/null
+++ b/kubeflow_templates/kfserve/custom_predictor_pytorch.py
@@ -0,0 +1,50 @@
+"""
+
+Customized model inference predictor: PyTorch
+
+"""
+
+import base64
+import io
+import torch
+
+from kserve import Model, ModelServer
+from PIL import Image
+from torchvision import models, transforms
+from typing import Dict
+
+
+class AlexNetModel(Model):
+    def __init__(self, name: str):
+        super().__init__(name)
+        self.name = name
+        self.model = None
+        self.load()
+        self.ready = True
+
+    def load(self):
+        self.model = models.alexnet(pretrained=True)
+        self.model.eval()
+
+    def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict:
+        # Decode the base64 encoded image sent in the request payload
+        img_data = payload["instances"][0]["image"]["b64"]
+        raw_img_data = base64.b64decode(img_data)
+        input_image = Image.open(io.BytesIO(raw_img_data))
+        preprocess = transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                 std=[0.229, 0.224, 0.225]),
+        ])
+        input_tensor = preprocess(input_image).unsqueeze(0)
+        output = self.model(input_tensor)
+        # Convert raw logits into class probabilities and return the top 5 scores
+        scores = torch.nn.functional.softmax(output, dim=1)
+        values, top_5 = torch.topk(scores, 5)
+        result = values.flatten().tolist()
+        return {"predictions": result}
+
+
+if __name__ == "__main__":
+    model = AlexNetModel("custom-model")
+    ModelServer().start([model])
diff --git a/kubeflow_templates/kfserve/inference.py b/kubeflow_templates/kfserve/inference.py
new file mode 100644
index 0000000..5669c87
--- /dev/null
+++ b/kubeflow_templates/kfserve/inference.py
@@ -0,0 +1,58 @@
+import requests
+import os
+import json
+
+from e2e.utils.utils import load_json_file
+
+
+def run_inference_sample():
+    # common vars
+    KUBEFLOW_DOMAIN = os.environ.get("KUBEFLOW_DOMAIN", "kubeflow.example.com")
+    PROFILE_NAMESPACE = os.environ.get("PROFILE_NAMESPACE", "kubeflow-user-example-com")
+    MODEL_NAME = os.environ.get("MODEL_NAME", "sklearn-iris")
+    AUTH_PROVIDER = os.environ.get("AUTH_PROVIDER", "dex")
+
+    URL = f"https://{MODEL_NAME}.{PROFILE_NAMESPACE}.{KUBEFLOW_DOMAIN}/v1/models/{MODEL_NAME}:predict"
+    HEADERS = {"Host": f"{MODEL_NAME}.{PROFILE_NAMESPACE}.{KUBEFLOW_DOMAIN}"}
+    DASHBOARD_URL = f"https://kubeflow.{KUBEFLOW_DOMAIN}"
+    data = load_json_file("./utils/kserve/iris-input.json")
+    response = None
+    if AUTH_PROVIDER != "cognito":
+        PROFILE_USERNAME = os.environ.get("PROFILE_USERNAME", "user@example.com")
+        PASSWORD = os.environ.get("PASSWORD", "12341234")
+
+        def session_cookie(host, login, password):
+            # Log in via the Dex form and return the resulting authservice session cookie
+            session = requests.Session()
+            response = session.get(host)
+            headers = {
+                "Content-Type": "application/x-www-form-urlencoded",
+            }
+            data = {"login": login, "password": password}
+            session.post(response.url, headers=headers, data=data)
+            session_cookie = session.cookies.get_dict()["authservice_session"]
+            return session_cookie
+
+        cookie = {
+            "authservice_session": session_cookie(
+                DASHBOARD_URL, PROFILE_USERNAME, PASSWORD
+            )
+        }
+        response = requests.post(URL, headers=HEADERS, json=data, cookies=cookie)
+    else:
+        HTTP_HEADER_NAME = os.environ.get("HTTP_HEADER_NAME", "x-api-key")
+        HTTP_HEADER_VALUE = os.environ.get("HTTP_HEADER_VALUE", "token1")
+        HEADERS[HTTP_HEADER_NAME] = HTTP_HEADER_VALUE
+
+        response = requests.post(URL, headers=HEADERS, json=data)
+
+    status_code = response.status_code
+    print("Status Code", status_code)
+    if status_code == 200:
+        print("JSON Response ", json.dumps(response.json(), indent=2))
+    else:
+        raise Exception(f"prediction failed, status code = {status_code}")
+
+
+if __name__ == "__main__":
+    run_inference_sample()
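+
+# Example invocation (values are illustrative and must match your deployment):
+#   export KUBEFLOW_DOMAIN=example.com
+#   export PROFILE_NAMESPACE=kubeflow-user-example-com
+#   export MODEL_NAME=sklearn-iris
+#   export AUTH_PROVIDER=dex
+#   export PROFILE_USERNAME=user@example.com
+#   export PASSWORD=12341234
+#   python inference.py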
diff --git a/kubeflow_templates/kfserve/inference_payload.json b/kubeflow_templates/kfserve/inference_payload.json
new file mode 100644
index 0000000..4570ccc
--- /dev/null
+++ b/kubeflow_templates/kfserve/inference_payload.json
@@ -0,0 +1,16 @@
+{
+  "instances": [
+    [
+      6.8,
+      2.8,
+      4.8,
+      1.4
+    ],
+    [
+      6.0,
+      3.4,
+      4.5,
+      1.6
+    ]
+  ]
+}
\ No newline at end of file
diff --git a/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/Dockerfile b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/Dockerfile
new file mode 100644
index 0000000..bd24d9f
--- /dev/null
+++ b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/Dockerfile
@@ -0,0 +1,17 @@
+# Use a glibc based image: catboost, xgboost and scikit-learn do not provide musl (Alpine) wheels
+FROM python:3.9-slim
+
+RUN mkdir "/src"
+
+COPY kserve/predictor/custom_predictor_sklearn/src /src
+COPY kserve/predictor/custom_predictor_sklearn/pyproject.toml /src
+
+WORKDIR /src
+
+RUN python -m pip install --upgrade pip
+RUN python -m pip install setuptools
+RUN python -m pip install poetry
+
+RUN poetry config virtualenvs.create false
+RUN poetry install --no-dev
+
+ENTRYPOINT ["python", "task.py"]
\ No newline at end of file
diff --git a/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/docker_build.sh b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/docker_build.sh
new file mode 100644
index 0000000..b17011e
--- /dev/null
+++ b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/docker_build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+export IMAGE_NAME=xxx
+export IMAGE_TAG=xxx
+export AWS_ACCOUNT_ID=xxx
+export AWS_ACCOUNT_REGION=xxx
+
+aws ecr get-login-password --region $AWS_ACCOUNT_REGION | sudo docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_ACCOUNT_REGION.amazonaws.com
+sudo docker build -t $IMAGE_NAME:$IMAGE_TAG .
+sudo docker tag $IMAGE_NAME:$IMAGE_TAG $AWS_ACCOUNT_ID.dkr.ecr.$AWS_ACCOUNT_REGION.amazonaws.com/$IMAGE_NAME:$IMAGE_TAG
+sudo docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_ACCOUNT_REGION.amazonaws.com/$IMAGE_NAME:$IMAGE_TAG
\ No newline at end of file
diff --git a/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/pyproject.toml b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/pyproject.toml
new file mode 100644
index 0000000..91dde03
--- /dev/null
+++ b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/pyproject.toml
@@ -0,0 +1,20 @@
+[tool.poetry]
+name = "predictor_sklearn"
+version = "0.1.0"
+description = "Predictor of endpoint inference pipeline"
+authors = ["gianni "]
+
+[tool.poetry.dependencies]
+python = "^3.9"
+catboost = "^1.2.2"
+pygam = "^0.9.0"
+s3fs = "^2023.9.2"
+scikit-learn = "^1.3.2"
+xgboost = "^2.0.2"
+# kserve is imported by src/task.py; the version constraint is indicative and should match the cluster
+kserve = "^0.11.0"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/rest_custom_serving_runtime_kserve.yaml b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/rest_custom_serving_runtime_kserve.yaml
new file mode 100644
index 0000000..4177d63
--- /dev/null
+++ b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/rest_custom_serving_runtime_kserve.yaml
@@ -0,0 +1,9 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: custom-model
+spec:
+  predictor:
+    containers:
+      - name: kserve-container
+        image: ${DOCKER_USER}/custom-model:v1
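+
+# Illustrative deployment and smoke test (the namespace is a placeholder; the ECR image pushed by
+# docker_build.sh can be referenced instead of ${DOCKER_USER}/custom-model:v1):
+#   kubectl apply -f rest_custom_serving_runtime_kserve.yaml -n <profile-namespace>
+#   kubectl get inferenceservice custom-model -n <profile-namespace>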
diff --git a/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/src/task.py b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/src/task.py
new file mode 100644
index 0000000..bb9d14a
--- /dev/null
+++ b/kubeflow_templates/kfserve/predictor/custom_predictor_sklearn/src/task.py
@@ -0,0 +1,58 @@
+"""
+
+Customized model inference predictor: Sklearn API (Non-Neural Networks)
+
+"""
+
+import os
+
+import joblib
+import numpy as np
+
+from kserve import Model, ModelServer
+from typing import Dict
+
+
+class SupervisedMLPredictor(Model):
+    """
+    Class for generating predictions used in inference endpoints of KServe
+    """
+    def __init__(self, name: str):
+        super().__init__(name)
+        self.name: str = name
+        self.model = None
+        self.load()
+        self.ready = True
+
+    def load(self):
+        # Illustrative sketch: load a pre-trained sklearn-API model (e.g. scikit-learn, XGBoost or
+        # CatBoost) serialized with joblib. MODEL_PATH is an assumption and must point to the
+        # artifact baked into the image or mounted into the container.
+        self.model = joblib.load(os.environ.get('MODEL_PATH', 'model.joblib'))
+
+    def preprocess(self, inputs: Dict, headers: Dict[str, str] = None) -> Dict:
+        # Pass the request through unchanged; feature engineering could be added here
+        return inputs
+
+    def predict(self, payload: Dict, headers: Dict[str, str] = None) -> Dict:
+        # Expect the KServe v1 payload format: {"instances": [[...], [...]]}
+        instances = np.array(payload['instances'])
+        prediction = self.model.predict(instances)
+        return {'predictions': prediction.tolist()}
+
+    def postprocess(self, inputs: Dict, headers: Dict[str, str] = None) -> Dict:
+        return inputs
+
+
+if __name__ == "__main__":
+    model = SupervisedMLPredictor("custom-model")
+    ModelServer().start([model])
diff --git a/kubeflow_templates/kubeflow_management/katib/kubeflow_katib_management.py b/kubeflow_templates/kubeflow_management/katib/kubeflow_katib_management.py
new file mode 100644
index 0000000..0d34ebc
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/katib/kubeflow_katib_management.py
@@ -0,0 +1,104 @@
+"""
+
+Management of katib setup
+
+"""
+
+import argparse
+import subprocess
+
+from typing import List
+
+
+PARSER = argparse.ArgumentParser(description="manage kubeflow hyperparameter tuning service called katib")
+PARSER.add_argument('-aws_account_id', type=str, required=True, default=None, help='AWS account id')
+PARSER.add_argument('-service_account_name', type=str, required=False, default='default-editor', help='name of the service account')
+PARSER.add_argument('-aws_region', type=str, required=False, default='eu-central-1', help='AWS region code')
+PARSER.add_argument('-cluster_name', type=str, required=False, default='kubeflow', help='name of the EKS cluster')
+ARGS = PARSER.parse_args()
+
+
+class KubeflowKatibManagementException(Exception):
+    """
+    Class for handling exceptions from class KubeflowKatibManagement
+    """
+    pass
+
+
+class KubeflowKatibManagement:
+    """
+    Class for handling Kubeflow katib management
+    """
+    def __init__(self,
+                 aws_account_id: str,
+                 service_account_name: str,
+                 aws_region: str = 'eu-central-1',
+                 cluster_name: str = 'kubeflow'
+                 ):
+        """
+        :param aws_account_id: str
+            AWS Account ID
+
+        :param service_account_name: str
+            Name of the service account
+
+        :param aws_region: str
+            AWS region
+
+        :param cluster_name: str
+            Name of the EKS cluster
+        """
+        self.service_account_name: str = service_account_name
+        self.cluster_name: str = cluster_name
+        self.aws_region: str = aws_region
+        self.aws_account_id: str = aws_account_id
+        self._login_to_eks_cluster()
+
+    def _adjust_katib_config(self) -> str:
+        """
+        Adjust Katib config yaml
+
+        :return: str
+            Adjusted katib config yaml file
+        """
+        # the katib-config ConfigMap lives in the kubeflow namespace
+        _cmd: str = "kubectl get configmap katib-config -n kubeflow -o yaml > katib_config.yaml"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+        with open('katib_config.yaml', 'r') as file:
+            _katib_config_yaml = file.read()
+        _config_yaml: List[str] = []
+        _found_area: bool = False
+        for line in _katib_config_yaml.split('\n'):
+            if line.find('suggestion: |-') >= 0:
+                _found_area = True
+            if line.find('kind: ConfigMap') >= 0:
+                _found_area = False
+            if _found_area and line.find('"image":') >= 0:
+                # inject the service account before the image entry so that the JSON stays valid
+                _indent: str = ' ' * (len(line) - len(line.lstrip()))
+                _config_yaml.append(f'{_indent}"serviceAccountName": "{self.service_account_name}",')
+            _config_yaml.append(line)
+        return "\n".join(_config_yaml)
+
+    def _login_to_eks_cluster(self) -> None:
+        """
+        Login to running EKS cluster
+        """
+        _cmd: str = f"aws eks --region {self.aws_region} update-kubeconfig --name {self.cluster_name}"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+
+    def enable_katib(self) -> None:
+        """
+        Enable katib by accessing S3 AWS services
+        """
+        _katib_config_yaml: str = self._adjust_katib_config()
+        with open('new_katib_config.yaml', 'w') as file:
+            file.write(_katib_config_yaml)
+        subprocess.run('kubectl apply -f new_katib_config.yaml', shell=True, capture_output=False, text=True)
+
+
+if __name__ == '__main__':
+    _kubeflow_katib_management: KubeflowKatibManagement = KubeflowKatibManagement(aws_account_id=ARGS.aws_account_id,
+                                                                                  service_account_name=ARGS.service_account_name,
+                                                                                  aws_region=ARGS.aws_region,
+                                                                                  cluster_name=ARGS.cluster_name
+                                                                                  )
+    _kubeflow_katib_management.enable_katib()
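+
+# Example invocation (the account id is a placeholder):
+#   python kubeflow_katib_management.py -aws_account_id 123456789012 \
+#       -service_account_name default-editor -aws_region eu-central-1 -cluster_name kubeflow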
diff --git a/kubeflow_templates/kubeflow_management/model_endpoint/add_secret.yaml b/kubeflow_templates/kubeflow_management/model_endpoint/add_secret.yaml
new file mode 100644
index 0000000..6629e8c
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/model_endpoint/add_secret.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: aws-secret
+  namespace: ${PROFILE_NAMESPACE}
+  annotations:
+    serving.kserve.io/s3-endpoint: s3.amazonaws.com
+    serving.kserve.io/s3-usehttps: "1"
+    serving.kserve.io/s3-region: ${CLUSTER_REGION}
+type: Opaque
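+
+# The ${PROFILE_NAMESPACE} and ${CLUSTER_REGION} placeholders are substituted by
+# kubeflow_model_endpoint_management.py before the manifest is applied. The secret only carries
+# the KServe S3 annotations; AWS credentials themselves are expected to come from the IAM role
+# for service accounts (IRSA) to which this secret is attached.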
diff --git a/kubeflow_templates/kubeflow_management/model_endpoint/kubeflow_model_endpoint_management.py b/kubeflow_templates/kubeflow_management/model_endpoint/kubeflow_model_endpoint_management.py
new file mode 100644
index 0000000..5b4835b
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/model_endpoint/kubeflow_model_endpoint_management.py
@@ -0,0 +1,182 @@
+"""
+
+Management of model endpoints in kserve & knative
+
+"""
+
+import argparse
+import subprocess
+
+from typing import List
+
+
+PARSER = argparse.ArgumentParser(description="manage kubeflow model endpoint")
+PARSER.add_argument('-aws_account_id', type=str, required=True, default=None, help='AWS account id')
+PARSER.add_argument('-profile_namespace', type=str, required=True, default=None, help='name of the profile namespace')
+PARSER.add_argument('-top_level_domain_name', type=str, required=True, default=None, help='name of the top level domain')
+PARSER.add_argument('-second_level_domain_name', type=str, required=True, default=None, help='name of the second level domain')
+PARSER.add_argument('-subdomain_name', type=str, required=True, default=None, help='name of the subdomain')
+PARSER.add_argument('-service_account_name', type=str, required=False, default='default-editor', help='name of the service account')
+PARSER.add_argument('-aws_region', type=str, required=False, default='eu-central-1', help='AWS region code')
+PARSER.add_argument('-cluster_name', type=str, required=False, default='kubeflow', help='name of the EKS cluster')
+PARSER.add_argument('-ecr_iam_role_policy_name', type=str, required=False, default='AmazonEC2ContainerRegistryReadOnly', help='name of the ECR IAM policy attached to IAM role for service account (IRSA)')
+PARSER.add_argument('-s3_iam_role_policy_name', type=str, required=False, default='AmazonS3ReadOnlyAccess', help='name of the S3 IAM policy attached to IAM role for service account (IRSA)')
+PARSER.add_argument('-meth', type=str, required=True, default=None, help='method name of class KubeflowModelEndpointManagement to apply')
+ARGS = PARSER.parse_args()
+
+
+class KubeflowModelEndpointManagementException(Exception):
+    """
+    Class for handling exceptions from class KubeflowModelEndpointManagement
+    """
+    pass
+
+
+class KubeflowModelEndpointManagement:
+    """
+    Class for handling Kubeflow model endpoint management
+    """
+    def __init__(self,
+                 aws_account_id: str,
+                 profile_namespace: str,
+                 top_level_domain_name: str,
+                 second_level_domain_name: str,
+                 subdomain_name: str = None,
+                 service_account_name: str = 'default-editor',
+                 aws_region: str = 'eu-central-1',
+                 cluster_name: str = 'kubeflow',
+                 ecr_iam_role_policy_name: str = 'AmazonEC2ContainerRegistryReadOnly',
+                 s3_iam_role_policy_name: str = 'AmazonS3ReadOnlyAccess',
+                 ):
+        """
+        :param aws_account_id: str
+            AWS Account ID
+
+        :param profile_namespace: str
+            Name of the profile namespace
+
+        :param top_level_domain_name: str
+            Name of the top level domain
+
+        :param second_level_domain_name: str
+            Name of the second level domain
+
+        :param subdomain_name: str
+            Name of the subdomain
+
+        :param service_account_name: str
+            Name of the service account
+
+        :param aws_region: str
+            AWS region
+
+        :param cluster_name: str
+            Name of the EKS cluster
+
+        :param ecr_iam_role_policy_name: str
+            Name of the ECR IAM role policy
+
+        :param s3_iam_role_policy_name: str
+            Name of the S3 IAM role policy
+        """
+        self.domain_name: str = f'{subdomain_name}.{second_level_domain_name}.{top_level_domain_name}'
+        self.profile_namespace: str = profile_namespace
+        self.service_account_name: str = service_account_name
+        self.cluster_name: str = cluster_name
+        self.aws_region: str = aws_region
+        self.aws_account_id: str = aws_account_id
+        self.ecr_iam_role_policy_name: str = ecr_iam_role_policy_name
+        self.s3_iam_role_policy_name: str = s3_iam_role_policy_name
+        self._login_to_eks_cluster()
+
+    def _adjust_default_domain_knative_services(self) -> str:
+        """
+        Adjust default domain for all knative services
+
+        :return: str
+            Adjusted knative service configmap yaml file
+        """
+        _cmd: str = "kubectl get configmap config-domain -n knative-serving -o yaml > knative_service_configmap.yaml"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+        with open('knative_service_configmap.yaml', 'r') as file:
+            _knative_service_configmap_yaml = file.read()
+        _config_yaml: List[str] = ['apiVersion: v1', 'data:', f'  {self.domain_name}: ""']
+        # skip the original header and the commented _example block; keep everything from 'kind: ConfigMap' on
+        _ignore_line: bool = True
+        for line in _knative_service_configmap_yaml.split('\n'):
+            if line.find('kind: ConfigMap') >= 0:
+                _ignore_line = False
+            if not _ignore_line:
+                _config_yaml.append(line)
+        return "\n".join(_config_yaml)
+
+    def _attach_secret_to_irsa(self) -> None:
+        """
+        Attach secret to IRSA in profile namespace
+        """
+        _p: str = '{"secrets": [{"name": "aws-secret"}]}'
+        _cmd: str = f"kubectl patch serviceaccount {self.service_account_name} -n {self.profile_namespace} -p '{_p}'"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+
+    def _create_irsa(self) -> None:
+        """
+        Create IAM role for service account (IRSA)
+        """
+        _cmd: str = f"eksctl create iamserviceaccount --name {self.service_account_name} --namespace {self.profile_namespace} --cluster {self.cluster_name} --region {self.aws_region} --attach-policy-arn=arn:aws:iam::aws:policy/{self.ecr_iam_role_policy_name} --attach-policy-arn=arn:aws:iam::aws:policy/{self.s3_iam_role_policy_name} --override-existing-serviceaccounts --approve"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+
+    def _create_secret(self) -> None:
+        """
+        Create secret
+        """
+        with open('add_secret.yaml', 'r') as file:
+            _secret_yaml = file.read()
+        # substitute the placeholders used in add_secret.yaml
+        _secret_yaml = _secret_yaml.replace("${PROFILE_NAMESPACE}", self.profile_namespace)
+        _secret_yaml = _secret_yaml.replace("${CLUSTER_REGION}", self.aws_region)
+        with open('new_secret.yaml', 'w') as file:
+            file.write(_secret_yaml)
+        subprocess.run('kubectl apply -f new_secret.yaml', shell=True, capture_output=False, text=True)
+
+    def _login_to_eks_cluster(self) -> None:
+        """
+        Login to running EKS cluster
+        """
+        _cmd: str = f"aws eks --region {self.aws_region} update-kubeconfig --name {self.cluster_name}"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+
+    def add_domain(self) -> None:
+        """
+        Add custom domain to knative services
+        """
+        _adjusted_knative_service_configmap_yaml: str = self._adjust_default_domain_knative_services()
+        with open('new_knative_service_configmap.yaml', 'w') as file:
+            file.write(_adjusted_knative_service_configmap_yaml)
+        subprocess.run('kubectl apply -f new_knative_service_configmap.yaml', shell=True, capture_output=False, text=True)
+
+    def enable_inference_service(self) -> None:
+        """
+        Enable inference service by accessing ECR and S3 AWS services
+        """
+        self._create_irsa()
+        self._create_secret()
+        self._attach_secret_to_irsa()
+
+
+if __name__ == '__main__':
+    _kubeflow_model_endpoint_management: KubeflowModelEndpointManagement = KubeflowModelEndpointManagement(aws_account_id=ARGS.aws_account_id,
+                                                                                                           profile_namespace=ARGS.profile_namespace,
+                                                                                                           top_level_domain_name=ARGS.top_level_domain_name,
+                                                                                                           second_level_domain_name=ARGS.second_level_domain_name,
+                                                                                                           subdomain_name=ARGS.subdomain_name,
+                                                                                                           service_account_name=ARGS.service_account_name,
+                                                                                                           aws_region=ARGS.aws_region,
+                                                                                                           cluster_name=ARGS.cluster_name,
+                                                                                                           ecr_iam_role_policy_name=ARGS.ecr_iam_role_policy_name,
+                                                                                                           s3_iam_role_policy_name=ARGS.s3_iam_role_policy_name
+                                                                                                           )
+    if ARGS.meth == 'add_domain':
+        _kubeflow_model_endpoint_management.add_domain()
+    elif ARGS.meth == 'enable_inference_service':
+        _kubeflow_model_endpoint_management.enable_inference_service()
+    else:
+        raise KubeflowModelEndpointManagementException(f'Method ({ARGS.meth}) not supported')
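+
+# Example invocations (all values are placeholders):
+#   python kubeflow_model_endpoint_management.py -aws_account_id 123456789012 -profile_namespace my-profile \
+#       -top_level_domain_name com -second_level_domain_name example -subdomain_name kubeflow -meth add_domain
+#   python kubeflow_model_endpoint_management.py -aws_account_id 123456789012 -profile_namespace my-profile \
+#       -top_level_domain_name com -second_level_domain_name example -subdomain_name kubeflow -meth enable_inference_service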
diff --git a/kubeflow_templates/kubeflow_management/tensorboard/kubeflow_tensorboard_management.py b/kubeflow_templates/kubeflow_management/tensorboard/kubeflow_tensorboard_management.py
new file mode 100644
index 0000000..743e165
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/tensorboard/kubeflow_tensorboard_management.py
@@ -0,0 +1,111 @@
+"""
+
+Management of tensorboard setup
+
+"""
+
+import argparse
+import subprocess
+
+
+PARSER = argparse.ArgumentParser(description="manage kubeflow tensorboard")
+PARSER.add_argument('-aws_account_id', type=str, required=True, default=None, help='AWS account id')
+PARSER.add_argument('-profile_namespace', type=str, required=True, default=None, help='name of the profile namespace')
+PARSER.add_argument('-service_account_name', type=str, required=False, default='default-editor', help='name of the service account')
+PARSER.add_argument('-aws_region', type=str, required=False, default='eu-central-1', help='AWS region code')
+PARSER.add_argument('-cluster_name', type=str, required=False, default='kubeflow', help='name of the EKS cluster')
+PARSER.add_argument('-s3_iam_role_policy_name', type=str, required=False, default='AmazonS3FullAccess', help='name of the S3 IAM policy attached to IAM role for service account (IRSA)')
+ARGS = PARSER.parse_args()
+
+
+class KubeflowTensorboardManagementException(Exception):
+    """
+    Class for handling exceptions from class KubeflowTensorboardManagement
+    """
+    pass
+
+
+class KubeflowTensorboardManagement:
+    """
+    Class for handling Kubeflow tensorboard management
+    """
+    def __init__(self,
+                 aws_account_id: str,
+                 profile_namespace: str,
+                 service_account_name: str = 'default-editor',
+                 aws_region: str = 'eu-central-1',
+                 cluster_name: str = 'kubeflow',
+                 s3_iam_role_policy_name: str = 'AmazonS3ReadOnlyAccess',
+                 ):
+        """
+        :param aws_account_id: str
+            AWS Account ID
+
+        :param profile_namespace: str
+            Name of the profile namespace
+
+        :param service_account_name: str
+            Name of the service account
+
+        :param aws_region: str
+            AWS region
+
+        :param cluster_name: str
+            Name of the EKS cluster
+
+        :param s3_iam_role_policy_name: str
+            Name of the S3 IAM role policy
+        """
+        self.profile_namespace: str = profile_namespace
+        self.service_account_name: str = service_account_name
+        self.cluster_name: str = cluster_name
+        self.aws_region: str = aws_region
+        self.aws_account_id: str = aws_account_id
+        self.s3_iam_role_policy_name: str = s3_iam_role_policy_name
+        self._login_to_eks_cluster()
+
+    def _adjust_s3_poddefault(self) -> None:
+        """
+        Adjust S3 poddefault yaml
+        """
+        with open('s3_poddefault.yaml', 'r') as file:
+            _s3_poddefault_yaml = file.read()
+        _s3_poddefault_yaml = _s3_poddefault_yaml.replace("${AWS_REGION}", self.aws_region)
+        _s3_poddefault_yaml = _s3_poddefault_yaml.replace("${SERVICE_ACCOUNT_NAME}", self.service_account_name)
+        with open('new_s3_poddefault.yaml', 'w') as file:
+            file.write(_s3_poddefault_yaml)
+        subprocess.run(f'kubectl apply -f new_s3_poddefault.yaml -n {self.profile_namespace}', shell=True, capture_output=False, text=True)
+
+    def _create_irsa(self) -> None:
+        """
+        Create IAM role for service account (IRSA)
+        """
+        _cmd: str = f"eksctl create iamserviceaccount --name {self.service_account_name} --namespace {self.profile_namespace} --cluster {self.cluster_name} --region {self.aws_region} --attach-policy-arn=arn:aws:iam::aws:policy/{self.s3_iam_role_policy_name} --override-existing-serviceaccounts --approve"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+
+    def _login_to_eks_cluster(self) -> None:
+        """
+        Login to running EKS cluster
+        """
+        _cmd: str = f"aws eks --region {self.aws_region} update-kubeconfig --name {self.cluster_name}"
+        subprocess.run(_cmd, shell=True, capture_output=False, text=True)
+
+    def enable_tensorboard(self) -> None:
+        """
+        Enable tensorboard by accessing S3 AWS services
+        """
+        self._create_irsa()
+        self._adjust_s3_poddefault()
+
+
+if __name__ == '__main__':
+    _kubeflow_tensorboard_management: KubeflowTensorboardManagement = KubeflowTensorboardManagement(aws_account_id=ARGS.aws_account_id,
+                                                                                                    profile_namespace=ARGS.profile_namespace,
+                                                                                                    service_account_name=ARGS.service_account_name,
+                                                                                                    aws_region=ARGS.aws_region,
+                                                                                                    cluster_name=ARGS.cluster_name,
+                                                                                                    s3_iam_role_policy_name=ARGS.s3_iam_role_policy_name
+                                                                                                    )
+    _kubeflow_tensorboard_management.enable_tensorboard()
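+
+# Example invocation (all values are placeholders):
+#   python kubeflow_tensorboard_management.py -aws_account_id 123456789012 -profile_namespace my-profile \
+#       -service_account_name default-editor -aws_region eu-central-1 -cluster_name kubeflow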
diff --git a/kubeflow_templates/kubeflow_management/tensorboard/s3_poddefault.yaml b/kubeflow_templates/kubeflow_management/tensorboard/s3_poddefault.yaml
new file mode 100644
index 0000000..166f94e
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/tensorboard/s3_poddefault.yaml
@@ -0,0 +1,13 @@
+apiVersion: kubeflow.org/v1alpha1
+kind: PodDefault
+metadata:
+  name: tensorboard-s3-config
+spec:
+  desc: S3 config for ${AWS_REGION} region
+  selector:
+    matchLabels:
+      tb-s3-config: "true"
+  env:
+    - name: AWS_REGION
+      value: ${AWS_REGION}
+  serviceAccountName: ${SERVICE_ACCOUNT_NAME}
\ No newline at end of file
diff --git a/kubeflow_templates/kubeflow_management/user_management/cognito/add_profile.yaml b/kubeflow_templates/kubeflow_management/user_management/cognito/add_profile.yaml
new file mode 100644
index 0000000..15741dd
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/user_management/cognito/add_profile.yaml
@@ -0,0 +1,13 @@
+apiVersion: kubeflow.org/v1
+kind: Profile
+metadata:
+  name: $(PROFILE_NAME)  # e.g.: kubeflow-user
+spec:
+  owner:
+    kind: User
+    name: $(PROFILE_USER)  # e.g.: user@example.com
+  plugins:
+    - kind: AwsIamForServiceAccount
+      spec:
+        awsIamRole: arn:aws:iam::$(ACCOUNT_ID):role/$(IAM_ROLE_NAME)  # $(aws iam get-role --role-name $PROFILE_NAME-$CLUSTER_NAME-role --output text --query 'Role.Arn')
+        annotateOnly: true
diff --git a/kubeflow_templates/kubeflow_management/user_management/cognito/add_profile_pod_default.yaml b/kubeflow_templates/kubeflow_management/user_management/cognito/add_profile_pod_default.yaml
new file mode 100644
index 0000000..a55c9a5
--- /dev/null
+++ b/kubeflow_templates/kubeflow_management/user_management/cognito/add_profile_pod_default.yaml
@@ -0,0 +1,28 @@
+apiVersion: kubeflow.org/v1alpha1
+kind: PodDefault
+metadata:
+  name: access-ml-pipeline
+  namespace: "$(PROFILE_NAME)"
+spec:
+  desc: Allow access to Kubeflow Pipelines
+  selector:
+    matchLabels:
+      access-ml-pipeline: "true"
+  env:
+    - ## this environment variable is automatically read by `kfp.Client()`
+      ## this is the default value, but we show it here for clarity
+      name: KF_PIPELINES_SA_TOKEN_PATH
+      value: /var/run/secrets/kubeflow/pipelines/token
+  volumes:
+    - name: volume-kf-pipeline-token
+      projected:
+        sources:
+          - serviceAccountToken:
+              path: token
+              expirationSeconds: 7200
+              ## defined by the `TOKEN_REVIEW_AUDIENCE` environment variable on the `ml-pipeline` deployment
+              audience: pipelines.kubeflow.org
+  volumeMounts:
+    - mountPath: /var/run/secrets/kubeflow/pipelines
+      name: volume-kf-pipeline-token
+      readOnly: true
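+
+## Illustrative usage: replace the $(PROFILE_NAME), $(PROFILE_USER), $(ACCOUNT_ID) and $(IAM_ROLE_NAME)
+## placeholders in add_profile.yaml and the $(PROFILE_NAME) placeholder in this file, then apply both manifests:
+##   kubectl apply -f add_profile.yaml
+##   kubectl apply -f add_profile_pod_default.yaml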