diff --git a/files/scripts/workspaces_launch_test.py b/files/scripts/workspaces_launch_test.py
new file mode 100644
index 000000000..4ab91e0d8
--- /dev/null
+++ b/files/scripts/workspaces_launch_test.py
@@ -0,0 +1,225 @@
+"""
+You can run this script by running the following command:
+python3 workspaces_launch_test.py --commons-url https://qa-heal.planx-pla.net --images "(Generic) Jupyter Lab Notebook with R Kernel+(Tutorials) Example Analysis Jupyter Lab Notebooks" --access-token eyJhbaccess.token
+Multiple image names should be separated by a plus (+) sign.
+"""
+import time
+import argparse
+import json
+import logging
+
+import requests
+
+workspace_internal_url = "http://workspace-token-service"
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+def main():
+    args = parse_args()
+    tester = WorkspaceLaunchTest(commons_url=args.commons_url, access_token=args.access_token, images=args.images.split("+")) # Images passed from the kubernetes jobs is separated by a plus sign "+"
+    tester.initialize_workspace_launch_test()
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Test Workspaces Launch"
+    )
+    parser.add_argument(
+        "--commons-url",
+        dest="commons_url",
+        help="Specify the Commons URL to test"
+    )
+    parser.add_argument(
+        "--access-token",
+        dest="access_token",
+        help="User's access token. It should have the 'credentials' scope since the /launch api requires an access token that can use to get an api key.",
+    )
+    parser.add_argument(
+        "--images",
+        dest="images",
+        help="Type of image to launch for testing."
+    )
+
+    return parser.parse_args()
+
+class WorkspaceLaunchTest:
+    def __init__(self, commons_url, access_token, images=None):  # None default avoids a shared mutable default list
+        self.commons_url = commons_url
+        self.workspace_internal_url = workspace_internal_url
+        self.token_expiration = 0
+        self.headers = {}
+        self.start_time = 0
+        self.end_time = 0
+        self.access_token = access_token
+        self.launch_status = "Workspace did not launch. Something went wrong before launch."
+        self.reason_for_failure = None
+        self.status_response = None
+        self.images = images if images is not None else ["(Generic, Limited Gen3-licensed) Stata image"]
+        self.update_headers()
+
+
+    def update_headers(self):
+        """Updates the headers with the current access token."""
+        if self.access_token:
+            self.headers = {"Authorization": f"Bearer {self.access_token}"}
+        else:
+            self.headers = {}
+
+    def get_info_for_image(self, image_name, options=None):
+        for option in options or []:  # tolerate options=None default
+            if option["name"] == image_name:
+                return option
+        return None
+
+
+    def initialize_workspace_launch_test(self):
+
+        test_image_ids_map = {} # maps image name -> workspace image id
+
+        # Get available workspace options
+        options_url = self.commons_url + "/lw-workspace/options"
+        try:
+            options_response = requests.get(options_url, headers=self.headers)
+            options_response.raise_for_status()
+
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Couldn't get workspace options with error: {e}"
+            logging.error(error_msg); self.reason_for_failure = error_msg
+            return  # cannot continue without the options list (options_response is unset)
+
+        options = options_response.json()
+        logging.info("Successfully found workspace options")
+        logging.info(f"Found {len(options)} Workspace options: {options}")
+
+        available_images = [option['name'] for option in options]
+
+        for image in self.images:
+            if image in available_images:
+                test_image_ids_map[image] = self.get_info_for_image(image, options)["id"]
+
+        logging.info(f"Images requested to test {self.images}")
+        logging.info(f"Images available to test {test_image_ids_map}")
+
+        unavailable_images = set(self.images) - set(available_images)
+        if len(unavailable_images) != 0:
+            logging.warning(f"The following requested images are not available: {unavailable_images}")
+
+        # Launch workspaces sequentially:
+        final_result = []
+        number_of_images = len(test_image_ids_map)
+        number_of_runs = 0
+
+        for image_name, image_id in test_image_ids_map.items():  # image_id: avoid shadowing builtin id()
+            logging.info(f"Testing image: {image_name}")
+            final_result.append(self.start_workspace_launch_test(image_name, image_id))
+            logging.info(f"Finished testing image: {image_name}")
+
+            number_of_runs += 1
+            if number_of_images != number_of_runs:
+                logging.info("Waiting to launch next image...")
+                time.sleep(120)
+
+
+        logging.info("Completed all launch tests...")
+        logging.info("Results:")
+        logging.info(json.dumps(final_result))
+
+    def start_workspace_launch_test(self, image_name, workspace_id):
+
+        # Launch workspace
+        launch_url = self.commons_url + "/lw-workspace/launch" + "?id=" + workspace_id
+        try:
+            launch_response = requests.post(launch_url, headers=self.headers)
+            launch_response.raise_for_status()
+            self.start_time = time.time()
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Couldn't launch workspace. Error code with error: {e}"
+            logging.error(error_msg)
+            self.reason_for_failure = error_msg
+            return
+
+        logging.info("Successfully started launching workspace. Starting timer and monitoring workspace status...")
+
+        self.monitor_workspace_status()
+
+        self.end_time = time.time()
+        logging.info(f"Workspace took {self.end_time-self.start_time} seconds to initialize")
+
+        time.sleep(60)
+        proxy_status_code = None  # recorded in json_result even when the proxy request fails outright
+        proxy_url = self.commons_url + "/lw-workspace/proxy/"
+        try:
+            logging.info("Trying to connect to workspace via proxy endpoint")
+            proxy_response = requests.get(proxy_url, headers=self.headers)
+            proxy_status_code = proxy_response.status_code
+            proxy_response.raise_for_status()
+
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Error connecting to workspace via proxy endpoint. Error: {e}"
+            logging.error(error_msg)
+            self.reason_for_failure = error_msg
+        # Terminate active running workspace
+        terminate_url = self.commons_url + "/lw-workspace/terminate"
+        try:
+            logging.info("Attempting to terminate workspace...")
+            terminate_response = requests.post(terminate_url, headers= self.headers)
+            terminate_response.raise_for_status()
+            logging.info("Workspace terminated...")
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Couldn't terminate workspace with error : {e}"
+            logging.error(error_msg)
+            self.reason_for_failure = error_msg
+
+        json_result = {
+            "image": image_name,
+            "workspace_id": workspace_id,
+            "start_time": self.start_time,
+            "end_time": self.end_time,
+            "duration": self.end_time - self.start_time,
+            "result": self.launch_status,
+            "reason_for_failure": self.reason_for_failure,
+            "status_response": self.status_response,
+            "proxy_status": proxy_status_code,
+        }
+        return json_result
+
+
+    def monitor_workspace_status(self, interval=10):
+        """
+        In an interval of given time (in seconds) hit the workspace status endpoint to monitor the status of the workspace
+
+        Args:
+            interval (int, optional): Interval (in seconds) to hit the options endpoint. Defaults to 10 seconds.
+        """
+        status_url = self.commons_url + "/lw-workspace/status"
+
+        while True:
+            try:
+                status_response = requests.get(status_url, headers=self.headers)
+                status_response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error_msg = f"Error checking workspace status: {e}"
+                logging.error(error_msg); self.reason_for_failure = error_msg
+                time.sleep(interval); continue  # status_response is unset on failure - retry after the normal interval
+
+            logging.info("Launch Response:")
+            logging.info(json.dumps(status_response.json()))
+
+            if status_response.json()["status"] == "Running":
+                self.launch_status = "Running"
+                self.status_response = status_response.json()
+                break
+            elif status_response.json()["status"] == "Not Found":
+                logging.error("Could not find workspace. Stopping status check...")
+                self.launch_status = "Not Found"
+                self.status_response = status_response.json()
+                break
+
+            time.sleep(interval)
+            logging.info(f"Elapsed time: {time.time()-self.start_time}")
+            self.launch_status = status_response.json()["status"]
+
+            self.status_response = status_response.json()
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/kube/services/jobs/workspace-launch-test-cronjob.yaml b/kube/services/jobs/workspace-launch-test-cronjob.yaml
new file mode 100644
index 000000000..f4f973a70
--- /dev/null
+++ b/kube/services/jobs/workspace-launch-test-cronjob.yaml
@@ -0,0 +1,197 @@
+# To run: gen3 job run workspace-launch-test-cronjob COMMONS_URL IMAGES
+# example: gen3 job run workspace-launch-test-cronjob COMMONS_URL https://qa-heal.planx-pla.net IMAGES '(Generic) Jupyter Lab Image with R Kernel' '(Tutorials) Example Analysis Jupyter Lab Notebooks'
+# If IMAGES isn't provided, it defaults to '(Generic) Jupyter Lab Image with R Kernel'
+
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: workspace-launch-test-cron
+spec:
+  schedule: "0 * * * *" # Schedule: runs hourly, at the top of each hour (adjust as needed)
+  jobTemplate:
+    spec:
+      template:
+        metadata:
+          labels:
+            app: gen3job
+        spec:
+          affinity:
+            nodeAffinity:
+              preferredDuringSchedulingIgnoredDuringExecution:
+                - weight: 100
+                  preference:
+                    matchExpressions:
+                      - key: karpenter.sh/capacity-type
+                        operator: In
+                        values:
+                          - on-demand
+                - weight: 99
+                  preference:
+                    matchExpressions:
+                      - key: eks.amazonaws.com/capacityType
+                        operator: In
+                        values:
+                          - ONDEMAND
+          containers:
+            - name: fence
+              GEN3_FENCE_IMAGE
+              imagePullPolicy: Always
+              env:
+                - name: PYTHONPATH
+                  value: /var/www/fence
+                - name: TEST_OPERATOR
+                  GEN3_TEST_OPERATOR|-value: "binamb@uchicago.edu"-|
+                - name: TOKEN_EXPIRATION
+                  GEN3_TOKEN_EXPIRATION|-value: "3600"-|
+                - name: FENCE_PUBLIC_CONFIG
+                  valueFrom:
+                    configMapKeyRef:
+                      name: manifest-fence
+                      key: fence-config-public.yaml
+                      optional: true
+              volumeMounts:
+                - name: 
"old-config-volume" + readOnly: true + mountPath: "/var/www/fence/local_settings.py" + subPath: local_settings.py + - name: "creds-volume" + readOnly: true + mountPath: "/var/www/fence/creds.json" + subPath: creds.json + - name: "config-helper" + readOnly: true + mountPath: "/var/www/fence/config_helper.py" + subPath: config_helper.py + - name: "json-secret-volume" + readOnly: true + mountPath: "/var/www/fence/fence_credentials.json" + subPath: fence_credentials.json + # ----------------------------------------------------------------------------- + - name: "config-volume" + readOnly: true + mountPath: "/var/www/fence/fence-config-secret.yaml" + subPath: fence-config.yaml + - name: "yaml-merge" + readOnly: true + mountPath: "/var/www/fence/yaml_merge.py" + subPath: yaml_merge.py + - name: "fence-jwt-keys" + readOnly: true + mountPath: "/fence/jwt-keys.tar" + subPath: "jwt-keys.tar" + - name: shared-data + mountPath: /mnt/shared + command: ["/bin/bash"] + args: + - "-c" + - | + echo "${FENCE_PUBLIC_CONFIG:-""}" > "/var/www/fence/fence-config-public.yaml" + python /var/www/fence/yaml_merge.py /var/www/fence/fence-config-public.yaml /var/www/fence/fence-config-secret.yaml > /var/www/fence/fence-config.yaml + if [ -f /fence/jwt-keys.tar ]; then + cd /fence + tar xvf jwt-keys.tar + if [ -d jwt-keys ]; then + mkdir -p keys + mv jwt-keys/* keys/ + fi + fi + echo "generate access token" + echo "fence-create --path fence token-create --type access_token --username $TEST_OPERATOR --scopes openid,user,test-client,credentials,data --exp $TOKEN_EXPIRATION" + tempFile="$(mktemp -p /tmp token.txt_XXXXXX)" + success=false + count=0 + sleepTime=10 + # retry loop + while [[ $count -lt 3 && $success == false ]]; do + if fence-create --path fence token-create --type access_token --username $TEST_OPERATOR --scopes openid,user,test-client,credentials,data --exp $TOKEN_EXPIRATION > "$tempFile"; then + echo "fence-create success!" 
+ tail -1 "$tempFile" > /mnt/shared/access_token.txt + # base64 --decode complains about invalid characters - don't know why + awk -F . '{ print $2 }' /mnt/shared/access_token.txt | base64 --decode 2> /dev/null + success=true + else + echo "fence-create failed!" + cat "$tempFile" + echo "sleep for $sleepTime, then retry" + sleep "$sleepTime" + let sleepTime=$sleepTime+$sleepTime + fi + let count=$count+1 + done + if [[ $success != true ]]; then + echo "Giving up on fence-create after $count retries - failed to create valid access token" + fi + echo "" + echo "All Done - always succeed to avoid k8s retries" + - name: workspace-launch-test + env: + - name: COMMONS_URL + GEN3_COMMONS_URL|-value: ""-| + - name: IMAGES + GEN3_IMAGES|-value: ""-| + image: quay.io/cdis/awshelper:master + imagePullPolicy: Always + volumeMounts: + - name: shared-data + mountPath: /mnt/shared + resources: + limits: + cpu: 3 + memory: "512Mi" + requests: + cpu: "1" + memory: "256Mi" + command: ["/bin/bash"] + args: + - "-c" + - | + # wait for the access token to be created in the sidecar "fence" container + let count = 0 + while [[ ! -f /mnt/shared/access_token.txt && $count -lt 50 ]]; do + echo "waiting for /mnt/shared/access_token.txt"; + sleep 2 + let count=$count+1 + done + + export ACCESS_TOKEN="$(cat /mnt/shared/access_token.txt)" + + echo "Installing dependencies..." + python3 -m pip3 install requests + python3 -m pip3 install logging + + flags="--commons-url $COMMONS_URL --access-token $ACCESS_TOKEN" + + if [[ -n "$IMAGES" ]]; then + flags="$flags --images $IMAGES" + fi + + echo "Running cloud-automation/files/scripts/workspaces_launch_test.py $flags..." + python3 ~/cloud-automation/files/scripts/workspaces_launch_test.py $flags + echo "Exit code: $?" 
+ + restartPolicy: Never + serviceAccountName: useryaml-job + volumes: + - name: yaml-merge + configMap: + name: "fence-yaml-merge" + - name: shared-data + emptyDir: {} + - name: old-config-volume + secret: + secretName: "fence-secret" + - name: creds-volume + secret: + secretName: "fence-creds" + - name: config-helper + configMap: + name: config-helper + - name: json-secret-volume + secret: + secretName: "fence-json-secret" + - name: config-volume + secret: + secretName: "fence-config" + - name: fence-jwt-keys + secret: + secretName: "fence-jwt-keys" diff --git a/kube/services/jobs/workspace-launch-test-job.yaml b/kube/services/jobs/workspace-launch-test-job.yaml new file mode 100644 index 000000000..dbfe59911 --- /dev/null +++ b/kube/services/jobs/workspace-launch-test-job.yaml @@ -0,0 +1,206 @@ +# To run: gen3 job run workspace-launch-test COMMONS_URL IMAGES +# example: gen3 job run workspace-launch-test COMMONS_URL https://qa-heal.planx-pla.net IMAGES "(Generic) Jupyter Lab Notebook with R Kernel+(Tutorials) Example Analysis Jupyter Lab Notebooks" +# If IMAGES isn't provided, it defaults to '(Generic) Jupyter Lab Image with R Kernel' + +apiVersion: batch/v1 +kind: Job +metadata: + name: workspace-launch-test +spec: + template: + metadata: + labels: + app: gen3job + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - weight: 99 + preference: + matchExpressions: + - key: eks.amazonaws.com/capacityType + operator: In + values: + - ONDEMAND + containers: + - name: fence + GEN3_FENCE_IMAGE + imagePullPolicy: Always + env: + - name: PYTHONPATH + value: /var/www/fence + - name: TEST_OPERATOR + GEN3_TEST_OPERATOR|-value: "binamb@uchicago.edu"-| + - name: TOKEN_EXPIRATION + GEN3_TOKEN_EXPIRATION|-value: "3600"-| + - name: FENCE_PUBLIC_CONFIG + valueFrom: + configMapKeyRef: + name: manifest-fence + key: 
fence-config-public.yaml
+                  optional: true
+          volumeMounts:
+            - name: "old-config-volume"
+              readOnly: true
+              mountPath: "/var/www/fence/local_settings.py"
+              subPath: local_settings.py
+            - name: "creds-volume"
+              readOnly: true
+              mountPath: "/var/www/fence/creds.json"
+              subPath: creds.json
+            - name: "config-helper"
+              readOnly: true
+              mountPath: "/var/www/fence/config_helper.py"
+              subPath: config_helper.py
+            - name: "json-secret-volume"
+              readOnly: true
+              mountPath: "/var/www/fence/fence_credentials.json"
+              subPath: fence_credentials.json
+# -----------------------------------------------------------------------------
+            - name: "config-volume"
+              readOnly: true
+              mountPath: "/var/www/fence/fence-config-secret.yaml"
+              subPath: fence-config.yaml
+            - name: "yaml-merge"
+              readOnly: true
+              mountPath: "/var/www/fence/yaml_merge.py"
+              subPath: yaml_merge.py
+            - name: "fence-jwt-keys"
+              readOnly: true
+              mountPath: "/fence/jwt-keys.tar"
+              subPath: "jwt-keys.tar"
+            - name: shared-data
+              mountPath: /mnt/shared
+          command: ["/bin/bash" ]
+          args:
+            - "-c"
+            - |
+              echo "${FENCE_PUBLIC_CONFIG:-""}" > "/var/www/fence/fence-config-public.yaml"
+              python /var/www/fence/yaml_merge.py /var/www/fence/fence-config-public.yaml /var/www/fence/fence-config-secret.yaml > /var/www/fence/fence-config.yaml
+              if [ -f /fence/jwt-keys.tar ]; then
+                cd /fence
+                tar xvf jwt-keys.tar
+                if [ -d jwt-keys ]; then
+                  mkdir -p keys
+                  mv jwt-keys/* keys/
+                fi
+              fi
+              echo "generate access token"
+              echo "fence-create --path fence token-create --type access_token --username $TEST_OPERATOR --scopes openid,user,test-client,credentials,data --exp $TOKEN_EXPIRATION"
+              tempFile="$(mktemp -p /tmp token.txt_XXXXXX)"
+              success=false
+              count=0
+              sleepTime=10
+              # retry loop
+              while [[ $count -lt 3 && $success == false ]]; do
+                if fence-create --path fence token-create --type access_token --username $TEST_OPERATOR --scopes openid,user,test-client,credentials,data --exp $TOKEN_EXPIRATION > "$tempFile"; then
+                  echo "fence-create success!"
+                  tail -1 "$tempFile" > /mnt/shared/access_token.txt
+                  # base64 --decode complains about invalid characters - don't know why
+                  awk -F . '{ print $2 }' /mnt/shared/access_token.txt | base64 --decode 2> /dev/null
+                  success=true
+                else
+                  echo "fence-create failed!"
+                  cat "$tempFile"
+                  echo "sleep for $sleepTime, then retry"
+                  sleep "$sleepTime"
+                  let sleepTime=$sleepTime+$sleepTime
+                fi
+                let count=$count+1
+              done
+              if [[ $success != true ]]; then
+                echo "Giving up on fence-create after $count retries - failed to create valid access token"
+              fi
+              echo ""
+              echo "All Done - always succeed to avoid k8s retries"
+        - name: workspace-launch-test
+          env:
+            - name: COMMONS_URL
+              GEN3_COMMONS_URL|-value: ""-|
+            - name: IMAGES
+              GEN3_IMAGES|-value: ""-|
+          image: quay.io/cdis/awshelper:master
+          imagePullPolicy: Always
+          volumeMounts:
+            - name: shared-data
+              mountPath: /mnt/shared
+          resources:
+            limits:
+              cpu: 3
+              memory: "512Mi"
+            requests:
+              cpu: "1"
+              memory: "256Mi"
+          command: [ "/bin/bash" ]
+          args:
+            - "-c"
+            - |
+              # wait for the access token to be created in the sidecar "fence" container
+              let count=0
+              while [[ ! -f /mnt/shared/access_token.txt && $count -lt 50 ]]; do
+                echo "waiting for /mnt/shared/access_token.txt";
+                sleep 2
+                let count=$count+1
+              done
+
+              export ACCESS_TOKEN="$(cat /mnt/shared/access_token.txt)"
+
+              echo "Installing dependencies..."
+              python3 -m pip install requests
+              # "logging" is part of the Python standard library - nothing to install
+
+              cmd="python3 cloud-automation/files/scripts/workspaces_launch_test.py --commons-url $COMMONS_URL --access-token $ACCESS_TOKEN"
+
+              if [[ -n "$IMAGES" ]]; then
+                echo "Running command:"
+                echo $cmd --images "$IMAGES" # this is the only way parsing the images works for some reason. If its not done like this, it parses only the first unbroken set of strings (until the first space) as the argument in the python command
+                $cmd --images "$IMAGES"
+              else
+                echo "Running command:"
+                echo $cmd
+                $cmd
+              fi
+
+              echo "Exit code: $?"
+
+      restartPolicy: Never
+      serviceAccountName: useryaml-job
+      volumes:
+        - name: yaml-merge
+          configMap:
+            name: "fence-yaml-merge"
+        - name: shared-data
+          emptyDir: {}
+# -----------------------------------------------------------------------------
+# DEPRECATED! Remove when all commons are no longer using local_settings.py
+# for fence.
+# -----------------------------------------------------------------------------
+        - name: old-config-volume
+          secret:
+            secretName: "fence-secret"
+        - name: creds-volume
+          secret:
+            secretName: "fence-creds"
+        - name: config-helper
+          configMap:
+            name: config-helper
+        - name: json-secret-volume
+          secret:
+            secretName: "fence-json-secret"
+# -----------------------------------------------------------------------------
+        - name: config-volume
+          secret:
+            secretName: "fence-config"
+        - name: fence-jwt-keys
+          secret:
+            secretName: "fence-jwt-keys"
+
+    
\ No newline at end of file