diff --git a/.secrets.baseline b/.secrets.baseline
index 9b3abca..5548e9d 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -145,13 +145,31 @@
         "line_number": 64
       }
     ],
+    "docs/local_installation.md": [
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/local_installation.md",
+        "hashed_secret": "08d2e98e6754af941484848930ccbaddfefe13d6",
+        "is_verified": false,
+        "line_number": 94
+      }
+    ],
+    "docs/s3.md": [
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/s3.md",
+        "hashed_secret": "08d2e98e6754af941484848930ccbaddfefe13d6",
+        "is_verified": false,
+        "line_number": 56
+      }
+    ],
     "gen3workflow/config-default.yaml": [
       {
         "type": "Secret Keyword",
         "filename": "gen3workflow/config-default.yaml",
         "hashed_secret": "afc848c316af1a89d49826c5ae9d00ed769415f3",
         "is_verified": false,
-        "line_number": 27
+        "line_number": 32
       }
     ],
     "migrations/versions/e1886270d9d2_create_system_key_table.py": [
@@ -169,7 +187,7 @@
         "filename": "tests/conftest.py",
         "hashed_secret": "0dd78d9147bb410f0cb0199c5037da36594f77d8",
         "is_verified": false,
-        "line_number": 188
+        "line_number": 222
       }
     ],
     "tests/migrations/test_migration_e1886270d9d2.py": [
@@ -180,7 +198,16 @@
         "is_verified": false,
         "line_number": 24
       }
+    ],
+    "tests/test-gen3workflow-config.yaml": [
+      {
+        "type": "Secret Keyword",
+        "filename": "tests/test-gen3workflow-config.yaml",
+        "hashed_secret": "900a7331f7bf83bff0e1b2c77f471b4a5145da0f",
+        "is_verified": false,
+        "line_number": 5
+      }
     ]
   },
-  "generated_at": "2024-11-19T19:43:31Z"
+  "generated_at": "2024-12-12T23:42:54Z"
 }
diff --git a/README.md b/README.md
index 8b48b55..8eab625 100644
--- a/README.md
+++ b/README.md
@@ -13,3 +13,4 @@ The documentation can be browsed in the [docs](docs) folder, and key documents a
 * [Detailed API Documentation](http://petstore.swagger.io/?url=https://raw.githubusercontent.com/uc-cdis/gen3-workflow/master/docs/openapi.yaml)
 * [Local installation](docs/local_installation.md)
 * [Authorization](docs/authorization.md)
+* [S3 interaction](docs/s3.md)
diff --git a/docs/local_installation.md b/docs/local_installation.md
index cbf69b7..e95b622 100644
--- a/docs/local_installation.md
+++ b/docs/local_installation.md
@@ -75,7 +75,8 @@ Try out the API at or
-> `http://localhost:8080` is where Gen3Workflow runs by default when started with `python run.py`.
+> The Gen3Workflow URL should be set to `http://localhost:8080` in this case; this is where the service runs by default when started with `python run.py`.
+
+- Run a workflow:
-Run a workflow:
+When setting your token manually:
 ```
+export GEN3_TOKEN=
 nextflow run hello
 ```
+Or, with the [Gen3 Python SDK](https://github.com/uc-cdis/gen3sdk-python) configured with an API key:
+```
+gen3 run nextflow run hello
+```
 
 ## AWS access
diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index a5ef6ec..0e384f9 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -41,7 +41,7 @@ openapi: 3.1.0
 paths:
   /:
     get:
-      operationId: get_status__get
+      operationId: get_status_2
       responses:
         '200':
           content:
@@ -55,7 +55,7 @@ paths:
   /_status:
     get:
-      operationId: get_status__status_get
+      operationId: get_status
       responses:
         '200':
           content:
@@ -69,7 +69,7 @@ paths:
   /_version:
     get:
-      operationId: get_version__version_get
+      operationId: get_version
       responses:
         '200':
           content:
@@ -83,7 +83,7 @@ paths:
   /ga4gh/tes/v1/service-info:
     get:
-      operationId: service_info_ga4gh_tes_v1_service_info_get
+      operationId: service_info
       responses:
         '200':
           content:
@@ -95,7 +95,7 @@ paths:
   /ga4gh/tes/v1/tasks:
     get:
-      operationId: list_tasks_ga4gh_tes_v1_tasks_get
+      operationId: list_tasks
       responses:
         '200':
           content:
@@ -108,7 +108,7 @@
       tags:
       - GA4GH TES
     post:
-      operationId: create_task_ga4gh_tes_v1_tasks_post
+      operationId: create_task
      responses:
        '200':
          content:
@@ -122,7 +122,7 @@ paths:
   /ga4gh/tes/v1/tasks/{task_id}:
     get:
-      operationId: get_task_ga4gh_tes_v1_tasks__task_id__get
+      operationId: get_task
       parameters:
       - in: path
         name: task_id
@@ -149,7 +149,7 @@ paths:
   /ga4gh/tes/v1/tasks/{task_id}:cancel:
     post:
-      operationId: cancel_task_ga4gh_tes_v1_tasks__task_id__cancel_post
+      operationId: cancel_task
       parameters:
       - in: path
         name: task_id
@@ -174,9 +174,258 @@
       summary: Cancel Task
       tags:
       - GA4GH TES
+  /s3/{path}:
+    delete:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    get:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    head:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    options:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    patch:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    post:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    put:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
+    trace:
+      description: 'Receive incoming S3 requests, re-sign them (AWS Signature Version
+        4 algorithm) with the
+
+        appropriate credentials to access the current user''s AWS S3 bucket, and forward
+        them to
+
+        AWS S3.'
+      operationId: s3_endpoint
+      parameters:
+      - in: path
+        name: path
+        required: true
+        schema:
+          title: Path
+          type: string
+      responses:
+        '200':
+          content:
+            application/json:
+              schema: {}
+          description: Successful Response
+        '422':
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/HTTPValidationError'
+          description: Validation Error
+      summary: S3 Endpoint
+      tags:
+      - S3
   /storage/credentials:
     get:
-      operationId: get_user_keys_storage_credentials_get
+      operationId: get_user_keys
       responses:
         '200':
           content:
@@ -189,7 +438,7 @@
       tags:
       - Storage
     post:
-      operationId: generate_user_key_storage_credentials_post
+      operationId: generate_user_key
       responses:
         '201':
           content:
@@ -203,7 +452,7 @@ paths:
   /storage/credentials/{key_id}:
     delete:
-      operationId: delete_user_key_storage_credentials__key_id__delete
+      operationId: delete_user_key
       parameters:
       - in: path
         name: key_id
@@ -227,7 +476,7 @@
       tags:
       - Storage
   /storage/info:
     get:
-      operationId: get_storage_info_storage_info_get
+      operationId: get_storage_info
       responses:
         '200':
           content:
diff --git a/docs/s3.md b/docs/s3.md
new file mode 100644
index 0000000..c15ba4f
--- /dev/null
+++ b/docs/s3.md
@@ -0,0 +1,72 @@
+# S3 interaction
+
+Note: This discussion can apply to many use cases, but it is written with a specific use case in mind: using the Gen3Workflow service to run Nextflow workflows.
+
+Contents:
+- [Using IAM keys](#using-iam-keys)
+- [Using a custom S3 endpoint](#using-a-custom-s3-endpoint)
+- [Diagram](#diagram)
+
+## Using IAM keys
+
+We initially considered generating IAM keys for users to upload their input files to S3, retrieve their output files, and store Nextflow intermediate files. Users would configure Nextflow with the generated IAM key ID and secret:
+
+```
+plugins {
+    id 'nf-ga4gh'
+}
+process {
+    executor = 'tes'
+    container = 'quay.io/nextflow/bash'
+}
+tes {
+    endpoint = '/ga4gh/tes'
+    oauthToken = "${GEN3_TOKEN}"
+}
+aws {
+    accessKey = "${AWS_KEY_ID}"
+    secretKey = "${AWS_KEY_SECRET}"
+    region = 'us-east-1'
+}
+workDir = ''
+```
+
+However, putting plain-text AWS IAM keys in users' hands raises security concerns: it makes auditing and traceability difficult, and the ability to easily view secrets in plain text is a concern in itself.
+
+## Using a custom S3 endpoint
+
+The `/s3` endpoint was implemented to avoid using IAM keys. This endpoint receives S3 requests, re-signs them with internal credentials, and forwards them to AWS S3. Users provide their Gen3 token as the "access key ID", which is used to verify they have the appropriate access. The token is then replaced with internal credentials that actually have access to AWS S3.
+
+Nextflow supports S3-compatible storage through the `aws.client.s3PathStyleAccess` and `aws.client.endpoint` settings, which allow users to point Nextflow to our custom S3 API:
+
+```
+plugins {
+    id 'nf-ga4gh'
+}
+process {
+    executor = 'tes'
+    container = 'quay.io/nextflow/bash'
+}
+tes {
+    endpoint = '/ga4gh/tes'
+    oauthToken = "${GEN3_TOKEN}"
+}
+aws {
+    accessKey = "${GEN3_TOKEN}"
+    secretKey = 'N/A'
+    region = 'us-east-1'
+    client {
+        s3PathStyleAccess = true
+        endpoint = '/s3'
+    }
+}
+workDir = ''
+```
+
+Notes:
+- We have to set the Gen3 token as the "key ID", not the "key secret", because the service needs to extract it from the request: the "key secret" is hashed and cannot be extracted.
+- When an `aws.accessKey` value is provided, the Nextflow configuration requires the `aws.secretKey` value to be provided as well. Users can set it to something like "N/A".
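+
+Any S3 client that lets you override the endpoint URL can be configured the same way. As an illustrative sketch only (the endpoint URL and bucket name below are placeholders, not values provided by the service), the equivalent `boto3` setup would look like this:
+
+```
+import os
+
+import boto3
+from botocore.config import Config
+
+s3_client = boto3.client(
+    "s3",
+    aws_access_key_id=os.environ["GEN3_TOKEN"],  # the Gen3 access token, not an AWS key ID
+    aws_secret_access_key="N/A",  # placeholder value; the request is re-signed server-side
+    endpoint_url="https://<gen3-workflow URL>/s3",  # placeholder
+    config=Config(s3={"addressing_style": "path"}),  # same as `s3PathStyleAccess = true` above
+)
+
+# for example, list the contents of your personal bucket (placeholder name):
+s3_client.list_objects(Bucket="gen3wf-<hostname>-<user ID>")
+```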
+
+## Diagram
+
+![s3 interaction diagram](s3.png)
diff --git a/docs/s3.png b/docs/s3.png
new file mode 100644
index 0000000..6627900
Binary files /dev/null and b/docs/s3.png differ
diff --git a/gen3workflow/app.py b/gen3workflow/app.py
index 4a73dd9..13b0a77 100644
--- a/gen3workflow/app.py
+++ b/gen3workflow/app.py
@@ -9,6 +9,7 @@ from gen3workflow import logger
 from gen3workflow.config import config
 from gen3workflow.routes.ga4gh_tes import router as ga4gh_tes_router
+from gen3workflow.routes.s3 import router as s3_router
 from gen3workflow.routes.storage import router as storage_router
 from gen3workflow.routes.system import router as system_router
 
@@ -28,6 +29,7 @@ def get_app(httpx_client=None) -> FastAPI:
     )
     app.async_client = httpx_client or httpx.AsyncClient()
     app.include_router(ga4gh_tes_router, tags=["GA4GH TES"])
+    app.include_router(s3_router, tags=["S3"])
     app.include_router(storage_router, tags=["Storage"])
     app.include_router(system_router, tags=["System"])
 
diff --git a/gen3workflow/aws_utils.py b/gen3workflow/aws_utils.py
index e98e8ae..9b30549 100644
--- a/gen3workflow/aws_utils.py
+++ b/gen3workflow/aws_utils.py
@@ -174,7 +174,6 @@ def delete_iam_user_key(user_id, key_id):
         )
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchEntity":
-            raise HTTPException(
-                HTTP_404_NOT_FOUND,
-                f"No such key: '{key_id}'",
-            )
+            err_msg = f"No such key: '{key_id}'"
+            logger.error(err_msg)
+            raise HTTPException(HTTP_404_NOT_FOUND, err_msg)
diff --git a/gen3workflow/config-default.yaml b/gen3workflow/config-default.yaml
index 270384d..53330db 100644
--- a/gen3workflow/config-default.yaml
+++ b/gen3workflow/config-default.yaml
@@ -16,6 +16,11 @@ MAX_IAM_KEYS_PER_USER: 2 # the default AWS AccessKeysPerUser quota is 2
 IAM_KEYS_LIFETIME_DAYS: 30
 USER_BUCKETS_REGION: us-east-1
 
+# configure an AWS IAM key to use when making S3 requests on behalf of users. If left empty, it
+# is assumed there is an existing STS session we can get credentials from.
+S3_ENDPOINTS_AWS_ACCESS_KEY_ID: +S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY: + ############# # DATABASE # ############# @@ -27,8 +32,6 @@ DB_USER: postgres DB_PASSWORD: postgres DB_DATABASE: gen3workflow_test - - ############# # GA4GH TES # ############# diff --git a/gen3workflow/config.py b/gen3workflow/config.py index fc747da..08862b6 100644 --- a/gen3workflow/config.py +++ b/gen3workflow/config.py @@ -48,6 +48,8 @@ def validate_top_level_configs(self): "MAX_IAM_KEYS_PER_USER": {"type": "integer", "maximum": 100}, "IAM_KEYS_LIFETIME_DAYS": {"type": "integer"}, "USER_BUCKETS_REGION": {"type": "string"}, + "S3_ENDPOINTS_AWS_ACCESS_KEY_ID": {"type": ["string", "null"]}, + "S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY": {"type": ["string", "null"]}, "ARBORIST_URL": {"type": ["string", "null"]}, "TASK_IMAGE_WHITELIST": {"type": "array", "items": {"type": "string"}}, "TES_SERVER_URL": {"type": "string"}, @@ -55,6 +57,10 @@ def validate_top_level_configs(self): } validate(instance=self, schema=schema) + assert bool(self["S3_ENDPOINTS_AWS_ACCESS_KEY_ID"]) == bool( + self["S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY"] + ), "Both 'S3_ENDPOINTS_AWS_ACCESS_KEY_ID' and 'S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY' must be configured, or both must be left empty" + config = Gen3WorkflowConfig(DEFAULT_CFG_PATH) try: diff --git a/gen3workflow/routes/ga4gh_tes.py b/gen3workflow/routes/ga4gh_tes.py index 160c03a..ea1aafe 100644 --- a/gen3workflow/routes/ga4gh_tes.py +++ b/gen3workflow/routes/ga4gh_tes.py @@ -99,10 +99,9 @@ async def create_task(request: Request, auth=Depends(Auth)): invalid_images = get_non_allowed_images(images_from_request, username) if invalid_images: - raise HTTPException( - HTTP_403_FORBIDDEN, - f"The specified images are not allowed: {list(invalid_images)}", - ) + err_msg = f"The specified images are not allowed: {list(invalid_images)}" + logger.error(f"{err_msg}. 
Allowed images: {config['TASK_IMAGE_WHITELIST']}") + raise HTTPException(HTTP_403_FORBIDDEN, err_msg) if "tags" not in body: body["tags"] = {} @@ -119,6 +118,7 @@ async def create_task(request: Request, auth=Depends(Auth)): username=username, user_id=user_id ) except ArboristError as e: + logger.error(e.message) raise HTTPException(e.code, e.message) return res.json() @@ -195,6 +195,7 @@ async def list_tasks(request: Request, auth=Depends(Auth)): }, ) except ArboristError as e: + logger.error(e.message) raise HTTPException(e.code, e.message) # filter out tasks the current user does not have access to diff --git a/gen3workflow/routes/s3.py b/gen3workflow/routes/s3.py new file mode 100644 index 0000000..1d47d58 --- /dev/null +++ b/gen3workflow/routes/s3.py @@ -0,0 +1,204 @@ +import hashlib +import urllib.parse + +import boto3 +from fastapi import APIRouter, HTTPException, Request +from fastapi.security import HTTPAuthorizationCredentials +from botocore.credentials import Credentials +import hmac +from starlette.datastructures import Headers +from starlette.responses import Response +from starlette.status import HTTP_401_UNAUTHORIZED + +from gen3workflow import aws_utils, logger +from gen3workflow.auth import Auth +from gen3workflow.config import config + + +router = APIRouter(prefix="/s3") + + +def get_access_token(headers: Headers) -> str: + """ + Extract the user's access token, which should have been provided as the key ID, from the + Authorization header in the following expected format: + `AWS4-HMAC-SHA256 Credential=////aws4_request, SignedHeaders=<>, Signature=<>` + + Args: + headers (Headers): request headers + + Returns: + str: the user's access token or "" if not found + """ + auth_header = headers.get("authorization") + if not auth_header: + return "" + try: + return auth_header.split("Credential=")[1].split("/")[0] + except Exception as e: + logger.error( + f"Unexpected format; unable to extract access token from authorization header: {e}" + ) + return "" + + +def get_signature_key(key: str, date: str, region_name: str, service_name: str) -> str: + """ + Create a signing key using the AWS Signature Version 4 algorithm. + """ + key_date = hmac.new( + f"AWS4{key}".encode("utf-8"), date.encode("utf-8"), hashlib.sha256 + ).digest() + key_region = hmac.new( + key_date, region_name.encode("utf-8"), hashlib.sha256 + ).digest() + key_service = hmac.new( + key_region, service_name.encode("utf-8"), hashlib.sha256 + ).digest() + key_signing = hmac.new(key_service, b"aws4_request", hashlib.sha256).digest() + return key_signing + + +@router.api_route( + "/{path:path}", + methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH", "TRACE", "HEAD"], +) +async def s3_endpoint(path: str, request: Request): + """ + Receive incoming S3 requests, re-sign them (AWS Signature Version 4 algorithm) with the + appropriate credentials to access the current user's AWS S3 bucket, and forward them to + AWS S3. 
+ """ + logger.debug(f"Incoming S3 request: '{request.method} {path}'") + + # extract the user's access token from the request headers, and ensure the user has access + # to run workflows + auth = Auth(api_request=request) + auth.bearer_token = HTTPAuthorizationCredentials( + scheme="bearer", credentials=get_access_token(request.headers) + ) + await auth.authorize("create", ["/services/workflow/gen3-workflow/tasks"]) + + # get the name of the user's bucket and ensure the user is making a call to their own bucket + token_claims = await auth.get_token_claims() + user_id = token_claims.get("sub") + user_bucket = aws_utils.get_safe_name_from_user_id(user_id) + request_bucket = path.split("?")[0].split("/")[0] + if request_bucket != user_bucket: + err_msg = f"'{path}' not allowed. You can make calls to your personal bucket, '{user_bucket}'" + logger.error(err_msg) + raise HTTPException(HTTP_401_UNAUTHORIZED, err_msg) + + # extract the request path (used in the canonical request) and the API endpoint (used to make + # the request to AWS). + # Example 1: + # - path = my-bucket// + # - request_path = // + # - api_endpoint = / + # Example 2: + # - path = my-bucket/pre/fix/ + # - request_path = /pre/fix/ + # - api_endpoint = pre/fix/ + request_path = path.split(user_bucket)[1] + api_endpoint = "/".join(request_path.split("/")[1:]) + + body = await request.body() + body_hash = hashlib.sha256(body).hexdigest() + timestamp = request.headers["x-amz-date"] + date = timestamp[:8] # the date portion (YYYYMMDD) of the timestamp + region = config["USER_BUCKETS_REGION"] + service = "s3" + + # generate the request headers: + # - first, copy all the headers from the original request. + headers = dict(request.headers) + # - remove the `authorization` header: it contains a Gen3 token instead of an AWS IAM key. + # The new `authorization` header will be added _after_ generating the signature. + headers.pop("authorization") + # - overwrite the `x-amz-content-sha256` header value with the body hash. When this header is + # set to "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" in the original request (payload sent over + # multiple chunks), we replace it with the body hash (because I couldn't get the signing to + # work for "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" - I believe it requires using the signature + # from the previous chunk). + # NOTE: This may cause issues when large files are _actually_ uploaded over multiple chunks. + headers["x-amz-content-sha256"] = body_hash + # - remove the `content-md5` header: when the `x-amz-content-sha256` header is overwritten (see + # above), the original `content-md5` value becomes incorrect. It's not required in V4 signing. + headers.pop("content-md5", None) + # - replace the `host` header, since we are re-signing and sending to a different host. 
+ headers["host"] = f"{user_bucket}.s3.amazonaws.com" + + # get AWS credentials from the configuration or the current assumed role session + if config["S3_ENDPOINTS_AWS_ACCESS_KEY_ID"]: + credentials = Credentials( + access_key=config["S3_ENDPOINTS_AWS_ACCESS_KEY_ID"], + secret_key=config["S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY"], + ) + else: # assume the service is running in k8s: get credentials from the assumed role + session = boto3.Session() + credentials = session.get_credentials() + assert credentials, "No AWS credentials found" + headers["x-amz-security-token"] = credentials.token + + # construct the canonical request + canonical_headers = "".join( + f"{key}:{headers[key]}\n" for key in sorted(list(headers.keys())) + ) + signed_headers = ";".join(sorted([k.lower() for k in headers.keys()])) + query_params = dict(request.query_params) + # the query params in the canonical request have to be sorted: + query_params_names = sorted(list(query_params.keys())) + canonical_query_params = "&".join( + f"{urllib.parse.quote_plus(key)}={urllib.parse.quote_plus(query_params[key])}" + for key in query_params_names + ) + canonical_request = ( + f"{request.method}\n" + f"{request_path}\n" + f"{canonical_query_params}\n" + f"{canonical_headers}" + f"\n" + f"{signed_headers}\n" + f"{body_hash}" + ) + + # construct the string to sign based on the canonical request + string_to_sign = ( + f"AWS4-HMAC-SHA256\n" + f"{timestamp}\n" + f"{date}/{region}/{service}/aws4_request\n" # credential scope + f"{hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()}" # canonical request hash + ) + + # generate the signing key, and generate the signature by signing the string to sign with the + # signing key + signing_key = get_signature_key(credentials.secret_key, date, region, service) + signature = hmac.new( + signing_key, string_to_sign.encode("utf-8"), hashlib.sha256 + ).hexdigest() + + # construct the Authorization header from the credentials and the signature, and forward the + # call to AWS S3 with the new Authorization header + headers["authorization"] = ( + f"AWS4-HMAC-SHA256 Credential={credentials.access_key}/{date}/{region}/{service}/aws4_request, SignedHeaders={signed_headers}, Signature={signature}" + ) + s3_api_url = f"https://{user_bucket}.s3.amazonaws.com/{api_endpoint}" + logger.debug(f"Outgoing S3 request: '{request.method} {s3_api_url}'") + response = await request.app.async_client.request( + method=request.method, + url=s3_api_url, + headers=headers, + params=query_params, + data=body, + ) + if response.status_code != 200: + logger.error(f"Error from AWS: {response.status_code} {response.text}") + + # return the response from AWS S3 + if "Content-Type" in response.headers: + return Response( + content=response.content, + status_code=response.status_code, + media_type=response.headers["Content-Type"], + ) + return Response(content=response.content, status_code=response.status_code) diff --git a/gen3workflow/routes/storage.py b/gen3workflow/routes/storage.py index c1cb3d9..a7d354c 100644 --- a/gen3workflow/routes/storage.py +++ b/gen3workflow/routes/storage.py @@ -8,9 +8,9 @@ HTTP_400_BAD_REQUEST, ) +from gen3workflow import aws_utils, logger from gen3workflow.auth import Auth from gen3workflow.config import config -from gen3workflow import aws_utils router = APIRouter(prefix="/storage") @@ -35,10 +35,9 @@ async def generate_user_key(request: Request, auth=Depends(Auth)): existing_keys = aws_utils.list_iam_user_keys(user_id) if len(existing_keys) >= config["MAX_IAM_KEYS_PER_USER"]: - raise 
HTTPException( - HTTP_400_BAD_REQUEST, - f"Too many existing keys: only {config['MAX_IAM_KEYS_PER_USER']} are allowed per user. Delete an existing key before creating a new one", - ) + err_msg = f"Too many existing keys: only {config['MAX_IAM_KEYS_PER_USER']} are allowed per user. Delete an existing key before creating a new one" + logger.error(err_msg) + raise HTTPException(HTTP_400_BAD_REQUEST, err_msg) key_id, key_secret = aws_utils.create_iam_user_and_key(user_id) return { diff --git a/run.py b/run.py index c9fa102..778e903 100644 --- a/run.py +++ b/run.py @@ -6,6 +6,8 @@ import os import sys + +from fastapi.routing import APIRoute import uvicorn import yaml @@ -15,9 +17,29 @@ CURRENT_DIR = os.path.dirname(os.path.realpath(__file__)) +def overwrite_openapi_operation_ids(app) -> None: + """ + The default operation ID format is `__`. + A bug is causing the operation IDs for the `/s3` endpoint, which accepts all methods, to not + be generated properly. This ensures unique operation IDs are generated for all routes. + """ + existing_routes = set() + for route in app.routes: + if not isinstance(route, APIRoute): + continue + route.operation_id = route.name + i = 2 + while route.operation_id in existing_routes: + route.operation_id = f"{route.name}_{i}" + i += 1 + existing_routes.add(route.operation_id) + + if __name__ == "__main__": - if sys.argv[-1] == "openapi": - schema = get_app().openapi() + if sys.argv[-1] == "openapi": # generate openapi docs + app = get_app() + overwrite_openapi_operation_ids(app) + schema = app.openapi() path = os.path.join(CURRENT_DIR, "docs/openapi.yaml") yaml.Dumper.ignore_aliases = lambda *args: True with open(path, "w+") as f: diff --git a/tests/conftest.py b/tests/conftest.py index 8a6339a..f6e112f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,8 @@ """ import asyncio +from datetime import datetime +from dateutil.tz import tzutc import json import os from unittest.mock import MagicMock, patch @@ -14,6 +16,8 @@ import pytest_asyncio from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from starlette.config import environ +from threading import Thread +import uvicorn # Set GEN3WORKFLOW_CONFIG_PATH *before* loading the app, which loads the configuration CURRENT_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -29,6 +33,36 @@ TEST_USER_ID = "64" NEW_TEST_USER_ID = "784" # a new user that does not already exist in arborist +# a "ListBucketResult" S3 response from AWS, and the corresponding response as parsed by boto3 +MOCKED_S3_RESPONSE_XML = f"""\ngen3wf-{config['HOSTNAME']}-{TEST_USER_ID}test-folder/test-file1.txt250urlfalsetest-folder/test-file1.txt2024-12-09T22:32:20.000Z"something"211somethingsomethingSTANDARD""" +MOCKED_S3_RESPONSE_DICT = { + "ResponseMetadata": { + "HTTPStatusCode": 200, + "HTTPHeaders": { + "server": "uvicorn", + "content-length": "569", + "content-type": "application/xml", + }, + "RetryAttempts": 0, + }, + "IsTruncated": False, + "Marker": "", + "Contents": [ + { + "Key": "test-folder/test-file1.txt", + "LastModified": datetime(2024, 12, 9, 22, 32, 20, tzinfo=tzutc()), + "ETag": '"something"', + "Size": 211, + "StorageClass": "STANDARD", + "Owner": {"DisplayName": "something", "ID": "something"}, + } + ], + "Name": f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}", + "Prefix": "test-folder/test-file1.txt", + "MaxKeys": 250, + "EncodingType": "url", +} + @pytest_asyncio.fixture(scope="function") async def engine(): @@ -71,7 +105,7 @@ async def session(engine): @pytest.fixture(scope="function") -def 
access_token_patcher(client, request): +def access_token_patcher(request): """ The `access_token` function will return a token linked to a test user. This fixture should be used explicitely instead of the automatic @@ -254,6 +288,7 @@ async def handle_request(request: Request): parsed_url = urlparse(url) mocked_response = None if url.startswith(config["TES_SERVER_URL"]): + # mock calls to the TES server path = url[len(config["TES_SERVER_URL"]) :].split("?")[0].rstrip("/") mocked_response = mock_tes_server_request( method=request.method, @@ -263,6 +298,7 @@ async def handle_request(request: Request): status_code=tes_resp_code, ) elif url.startswith(config["ARBORIST_URL"]): + # mock calls to Arborist path = url[len(config["ARBORIST_URL"]) :].split("?")[0].rstrip("/") mocked_response = mock_arborist_request( method=request.method, @@ -270,6 +306,15 @@ async def handle_request(request: Request): body=request.content.decode(), authorized=authorized, ) + elif url.startswith( + f"https://gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}.s3.amazonaws.com" + ): + # mock calls to AWS S3 + mocked_response = httpx.Response( + status_code=200, + text=MOCKED_S3_RESPONSE_XML, + headers={"content-type": "application/xml"}, + ) if mocked_response is not None: print(f"Mocking request '{request.method} {url}'") @@ -287,11 +332,29 @@ async def handle_request(request: Request): transport=httpx.MockTransport(handle_request) ) - # the tests use a real httpx client that forwards requests to the app - async with httpx.AsyncClient( - app=app, base_url="http://test-gen3-wf" - ) as real_httpx_client: - # for easier access to the param in the tests - real_httpx_client.tes_resp_code = tes_resp_code - real_httpx_client.authorized = authorized - yield real_httpx_client + get_url = False + if hasattr(request, "param"): + get_url = request.param.get("get_url", get_url) + + if get_url: # for tests that need to hit the app URL directly + host = "0.0.0.0" + port = 8080 + + def run_uvicorn(): + uvicorn.run(app, host=host, port=port) + + # start the app in a separate thread + thread = Thread(target=run_uvicorn) + thread.daemon = True # ensures the thread ends when the test ends + thread.start() + + yield f"http://{host}:{port}" # URL to use in the tests + else: + # the tests use a real httpx client that forwards requests to the app + async with httpx.AsyncClient( + app=app, base_url="http://test-gen3-wf" + ) as real_httpx_client: + # for easier access to the param in the tests + real_httpx_client.tes_resp_code = tes_resp_code + real_httpx_client.authorized = authorized + yield real_httpx_client diff --git a/tests/test-gen3workflow-config.yaml b/tests/test-gen3workflow-config.yaml index fb37faa..b3aff15 100644 --- a/tests/test-gen3workflow-config.yaml +++ b/tests/test-gen3workflow-config.yaml @@ -1,6 +1,9 @@ DEBUG: true ARBORIST_URL: http://test-arborist-server +S3_ENDPOINTS_AWS_ACCESS_KEY_ID: test-aws-key-id +S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY: test-aws-key + TES_SERVER_URL: http://external-tes-server/tes TASK_IMAGE_WHITELIST: - public.ecr.aws/random/approved/public:* diff --git a/tests/test_s3_endpoint.py b/tests/test_s3_endpoint.py new file mode 100644 index 0000000..d2f5f13 --- /dev/null +++ b/tests/test_s3_endpoint.py @@ -0,0 +1,76 @@ +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError +import pytest + +from conftest import MOCKED_S3_RESPONSE_DICT, TEST_USER_ID +from gen3workflow.config import config + + +@pytest.fixture() +def s3_client(client): + """ + Return an S3 client configured to 
talk to the gen3-workflow `/s3` endpoint. + """ + session = boto3.session.Session() + return session.client( + service_name="s3", + aws_access_key_id=config["S3_ENDPOINTS_AWS_ACCESS_KEY_ID"], + aws_secret_access_key=config["S3_ENDPOINTS_AWS_SECRET_ACCESS_KEY"], + endpoint_url=f"{client}/s3", + # no retries; only try each call once: + config=Config(retries={"max_attempts": 0}), + ) + + +@pytest.mark.parametrize("client", [{"get_url": True}], indirect=True) +def test_s3_endpoint(s3_client, access_token_patcher): + """ + Hitting the `/s3` endpoint should result in the request being forwarded to AWS S3. + """ + res = s3_client.list_objects(Bucket=f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}") + res.get("ResponseMetadata", {}).get("HTTPHeaders", {}).pop("date", None) + assert res == MOCKED_S3_RESPONSE_DICT + + +@pytest.mark.parametrize("client", [{"get_url": True}], indirect=True) +def test_s3_endpoint_no_token(s3_client): + """ + Hitting the `/s3` endpoint without a Gen3 access token should result in a 401 Unauthorized + error. + """ + with pytest.raises(ClientError, match="Unauthorized"): + s3_client.list_objects(Bucket=f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}") + + +""" +This test currently doesn't work because the client generated when `get_url` is True is not stopped +properly, so generating a different client (with `authorized=False` param) triggers an error: +> OSError: [Errno 48] error while attempting to bind on address ('0.0.0.0', 8080): address already + in use +TODO fix that +""" +# @pytest.mark.parametrize("client", [{"get_url": True, "authorized": False}], indirect=True) +# def test_s3_endpoint_unauthorized(s3_client, access_token_patcher): +# """ +# Hitting the `/s3` endpoint with a Gen3 access token that does not have the appropriate access +# should result in a 403 Forbidden error. +# """ +# with pytest.raises(ClientError, match="403"): +# s3_client.list_objects(Bucket=f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}") + + +@pytest.mark.parametrize("client", [{"get_url": True}], indirect=True) +@pytest.mark.parametrize( + "bucket_name", + ["not-the-user-s-bucket", f"gen3wf-{config['HOSTNAME']}-{TEST_USER_ID}-2"], +) +def test_s3_endpoint_wrong_bucket(s3_client, access_token_patcher, bucket_name): + """ + Hitting the `/s3` endpoint with a bucket that is not the bucket generated by gen3-workflow for + the current user should result in a 401 Unauthorized error. + Specific edge case: if the user's bucket is "gen3wf--", a bucket name which + is a superstring of that, such as "gen3wf---2", should not be allowed. + """ + with pytest.raises(ClientError, match="Unauthorized"): + s3_client.list_objects(Bucket=bucket_name)