From a06d52e26333b88ae90c515acff70dfb6e5feedc Mon Sep 17 00:00:00 2001 From: Holly Gong Date: Mon, 26 Aug 2024 14:44:24 +1000 Subject: [PATCH 1/6] chore: add PR title check to follow Git commit convention --- .github/workflows/pr-check.yml | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/pr-check.yml diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml new file mode 100644 index 00000000000..9e8ae6c231e --- /dev/null +++ b/.github/workflows/pr-check.yml @@ -0,0 +1,35 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: OSV PR format check + +on: + # `pull_request_target` is only required when editing PRs from forks. + pull_request: + types: + - opened + - edited + - reopened + +permissions: + pull-requests: read + +jobs: + title-check: + name: Validate PR title + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From abfc0c3b90e3fb465ef1758b48f737302075a1e1 Mon Sep 17 00:00:00 2001 From: Holly Gong Date: Fri, 30 Aug 2024 15:18:46 +1000 Subject: [PATCH 2/6] feat: add a cron job to test OSV API on test instance --- deployment/build-and-stage.yaml | 11 + .../oss-vdb-test/kustomization.yaml | 1 + .../oss-vdb-test/staging-test.yaml | 26 + docker/staging_test/Dockerfile | 30 ++ docker/staging_test/build.sh | 19 + docker/staging_test/perform_api_calls.py | 461 ++++++++++++++++++ docker/staging_test/retrieve_bugs_from_db.py | 115 +++++ docker/staging_test/run.sh | 23 + 8 files changed, 686 insertions(+) create mode 100644 deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml create mode 100644 docker/staging_test/Dockerfile create mode 100755 docker/staging_test/build.sh create mode 100755 docker/staging_test/perform_api_calls.py create mode 100644 docker/staging_test/retrieve_bugs_from_db.py create mode 100644 docker/staging_test/run.sh diff --git a/deployment/build-and-stage.yaml b/deployment/build-and-stage.yaml index 942beee7208..f25c7198570 100644 --- a/deployment/build-and-stage.yaml +++ b/deployment/build-and-stage.yaml @@ -98,6 +98,15 @@ steps: args: ['push', '--all-tags', 'gcr.io/oss-vdb/alias-computation'] waitFor: ['build-alias-computation', 'cloud-build-queue'] +- name: gcr.io/cloud-builders/docker + args: ['build', '-t', 'gcr.io/oss-vdb/staging-test:latest', '-t', 'gcr.io/oss-vdb/staging-test:$COMMIT_SHA', '.'] + dir: 'docker/staging_test' + id: 'build-staging-test' + waitFor: ['build-worker'] +- name: gcr.io/cloud-builders/docker + args: ['push', '--all-tags', 'gcr.io/oss-vdb/staging-test'] + waitFor: ['build-staging-test', 'cloud-build-queue'] + # Build/push cron job images. 
- name: gcr.io/cloud-builders/docker args: ['build', '-t', 'gcr.io/oss-vdb/cron:latest', '-t', 'gcr.io/oss-vdb/cron:$COMMIT_SHA', '.'] @@ -263,6 +272,7 @@ steps: importer=gcr.io/oss-vdb/importer:$COMMIT_SHA,\ exporter=gcr.io/oss-vdb/exporter:$COMMIT_SHA,\ alias-computation=gcr.io/oss-vdb/alias-computation:$COMMIT_SHA,\ + staging-test=gcr.io/oss-vdb/staging-test:$COMMIT_SHA,\ cron=gcr.io/oss-vdb/cron:$COMMIT_SHA,\ debian-convert=gcr.io/oss-vdb/debian-convert:$COMMIT_SHA,\ combine-to-osv=gcr.io/oss-vdb/combine-to-osv:$COMMIT_SHA,\ @@ -314,6 +324,7 @@ images: - 'gcr.io/oss-vdb/importer:$COMMIT_SHA' - 'gcr.io/oss-vdb/exporter:$COMMIT_SHA' - 'gcr.io/oss-vdb/alias-computation:$COMMIT_SHA' +- 'gcr.io/oss-vdb/staging-test:$COMMIT_SHA' - 'gcr.io/oss-vdb/cron:$COMMIT_SHA' - 'gcr.io/oss-vdb/alpine-cve-convert:$COMMIT_SHA' - 'gcr.io/oss-vdb/debian-cve-convert:$COMMIT_SHA' diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml index 1c7656f3b7a..eb4b6019790 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml @@ -1,5 +1,6 @@ resources: - ../../base +- staging-test.yaml patches: - path: workers.yaml - path: scaler.yaml diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml new file mode 100644 index 00000000000..59f9dca49cf --- /dev/null +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml @@ -0,0 +1,26 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: staging-test +spec: + schedule: "50 * * * *" + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - name: staging-test + image: staging-test + imagePullPolicy: Always + env: + - name: GOOGLE_CLOUD_PROJECT + value: "oss-vdb-test" + resources: + requests: + cpu: 1 + memory: "3G" + limits: + cpu: 1 + memory: "4G" + restartPolicy: Never \ No newline at end of file diff --git a/docker/staging_test/Dockerfile b/docker/staging_test/Dockerfile new file mode 100644 index 00000000000..a2cb54b9f5c --- /dev/null +++ b/docker/staging_test/Dockerfile @@ -0,0 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
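+
+# Build on top of the worker image, which already bundles the `osv` package
+# imported by these scripts and the Poetry project at /env/docker/worker
+# that is extended below to add aiohttp.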
+ +FROM gcr.io/oss-vdb/worker + +WORKDIR /staging_testing + +COPY retrieve_bugs_from_db.py /staging_testing +COPY perform_api_calls.py /staging_testing +COPY run.sh /staging_testing + +# Add aiohttp lib +RUN cd /env/docker/worker && POETRY_VIRTUALENVS_CREATE=false poetry add aiohttp + +RUN chmod 755 /staging_testing/retrieve_bugs_from_db.py +RUN chmod 755 /staging_testing/perform_api_calls.py +RUN chmod 755 /staging_testing/run.sh + +ENTRYPOINT ["/staging_testing/run.sh"] diff --git a/docker/staging_test/build.sh b/docker/staging_test/build.sh new file mode 100755 index 00000000000..8421fa3f69b --- /dev/null +++ b/docker/staging_test/build.sh @@ -0,0 +1,19 @@ +#!/bin/bash -x +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +docker build -t gcr.io/oss-vdb/staging-test:$1 . && \ +docker build -t gcr.io/oss-vdb/staging-test:latest . && \ +docker push gcr.io/oss-vdb/staging-test:$1 && \ +docker push gcr.io/oss-vdb/staging-test:latest diff --git a/docker/staging_test/perform_api_calls.py b/docker/staging_test/perform_api_calls.py new file mode 100755 index 00000000000..f4e4a536212 --- /dev/null +++ b/docker/staging_test/perform_api_calls.py @@ -0,0 +1,461 @@ +#!/usr/bin/env python3 +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Mock API queries and send them to the test API endpoint for +performance testing. It is recommended to use two terminals to +run this script concurrently to generate sufficient traffic.""" + +import logging +import aiohttp +import asyncio +import os +import random +import sys +import time +import json + +from collections import Counter, defaultdict +from typing import Callable + +import osv +import osv.logs + +BASE_URL = 'https://api.test.osv.dev/v1' +GCP_PROJECT = 'oss-vdb-test' +BUG_DIR = '/staging_testing/all_bugs' + +# Total run time in seconds +TOTAL_RUNTIME = 3600 +# Execute all pending batch size requests within the specified time interval. 
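+# For example, with FREQUENCY_IN_SECONDS = 1 and VULN_QUERY_BATCH_SIZE = 50,
+# roughly 50 vulnerability `GET` requests are issued every second.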
+FREQUENCY_IN_SECONDS = 1
+
+# Number of `vulnerability get` requests to send per second
+VULN_QUERY_BATCH_SIZE = 50
+# Number of `version query` requests to send per second
+VERSION_QUERY_BATCH_SIZE = 100
+# Number of `package query` requests to send per second
+PACKAGE_QUERY_BATCH_SIZE = 30
+# Number of `batch query` requests to send per second
+BATCH_QUERY_BATCH_SIZE = 3
+# Number of large `batch query` requests to send per second
+LARGE_BATCH_QUERY_BATCH_SIZE = 2
+
+
+class SimpleBug:
+  """A simplified bug that contains only the essential information
+  needed for making HTTP requests."""
+
+  def __init__(self, bug_dict: dict):
+    self.db_id = bug_dict['db_id']
+    # If the package/ecosystem/version value is None, then add a fake value in.
+    # Use a one-element list so iteration and `random.choice()` still work.
+    if not bug_dict['project']:
+      self.packages = ['foo']
+    else:
+      self.packages = list(bug_dict['project'])
+    self.purl = bug_dict['purl']
+    if not bug_dict['ecosystem']:
+      self.ecosystems = ['foo']
+    else:
+      self.ecosystems = list(bug_dict['ecosystem'])
+
+    # Use the `affected_fuzzy` value as the query version.
+    # If no `affected_fuzzy` is present, assign a default value.
+    self.affected_fuzzy = bug_dict['affected_fuzzy']
+    if not self.affected_fuzzy:
+      self.affected_fuzzy = '1.0.0'
+
+
+def read_from_json(filename: str, ecosystem_map: defaultdict, bug_map: dict,
+                   package_map: defaultdict) -> None:
+  """Loads bugs from one JSON file into bug dicts.
+
+  Args:
+    filename: the JSON filename.
+
+    ecosystem_map:
+      A defaultdict mapping ecosystem names to their bugs. For example:
+      {'Maven': (CVE-XXXX-XXXX, CVE-XXXX-XXXX), 'PyPI': ()}
+
+    bug_map:
+      A dict mapping bug ID to its `SimpleBug` object. For example:
+      {'CVE-XXXX-XXXX': SimpleBug(...)}
+
+    package_map:
+      A defaultdict mapping package names to their bugs. For example:
+      {'tensorflow': (CVE-XXXX-XXXX, CVE-XXXX-XXXX), 'curl': ()}
+
+  Returns:
+    None
+  """
+  with open(filename, "r") as f:
+    json_file = json.load(f)
+    for bug_data in json_file:
+      bug = SimpleBug(bug_data)
+      for ecosystem in bug.ecosystems:
+        ecosystem_map[ecosystem].add(bug.db_id)
+      for package in bug.packages:
+        package_map[package].add(bug.db_id)
+      bug_map[bug.db_id] = bug
+
+
+def load_all_bugs() -> tuple[defaultdict, dict, defaultdict]:
+  """Loads bugs from the JSON directory.
+
+  Returns:
+    A defaultdict mapping ecosystem names to their bugs. For example:
+    {'Maven': (CVE-XXXX-XXXX, CVE-XXXX-XXXX), 'PyPI': ()}
+
+    A dict mapping bug ID to its `SimpleBug` object. For example:
+    {'CVE-XXXX-XXXX': SimpleBug(...)}
+
+    A defaultdict mapping package names to their bugs. 
For example: + {'tensorflow': (CVE-XXXX-XXXX, CVE-XXXX-XXXX), 'curl': ()} + """ + + ecosystem_map = defaultdict(set) + bug_map = {} + package_map = defaultdict(set) + for filename in os.listdir(BUG_DIR): + if filename.endswith('.json'): + file_path = os.path.join(BUG_DIR, filename) + read_from_json(file_path, ecosystem_map, bug_map, package_map) + return ecosystem_map, bug_map, package_map + + +async def make_http_request(session: aiohttp.ClientSession, request_url: str, + request_type: str, request_body: dict) -> None: + """Makes one HTTP request + + Args: + session: + The HTTP ClientSession + request_url: + The HTTP request URL + request_type: + The HTTP request type: `GET` or `POST` + request_body: + The HTTP request body in JSON format + """ + try: + timeout = aiohttp.ClientTimeout(total=None, sock_connect=80, sock_read=80) + if request_type == 'GET': + async with session.get(request_url): + pass # We're not awaiting the response, just sending the request + elif request_type == 'POST': + async with session.post(request_url, json=request_body, timeout=timeout): + # print(f'request: {request_body}, response: {response.status}') + pass # We're not awaiting the response, just sending the request + except Exception as e: + logging.warning(f'Error sending request {request_url} with body' + f'{request_body}: {type(e)}') + + +async def make_http_requests_async(request_ids: list, bug_map: dict, url: str, + batch_size: int, + payload_func: Callable) -> None: + """Makes the required number of HTTP requests per second async. + + Args: + request_ids: + A list of bug IDs + bug_map: + A dict mapping bug IDs to the corresponding `SimpleBug` objects + url: + The request URL + batch_size: + The number of requests to make per second + payload_func: + The payload function, such as `build_batch_payload` + """ + + begin_time = time.monotonic() + logging.info(f'[{begin_time}] Running make request {payload_func.__name__} ' + f'for {TOTAL_RUNTIME} seconds') + + total_run_time = time.monotonic() - begin_time + index = 0 + length = len(request_ids) + async with aiohttp.ClientSession() as session: + while total_run_time < TOTAL_RUNTIME: + start_time = time.monotonic() + + batch_request_ids = request_ids[index:batch_size + index] + if payload_func.__name__ == build_vulnerability_payload.__name__: + for request_id in batch_request_ids: + # OSV getting vulnerability detail is a GET request + asyncio.create_task( + make_http_request(session, f'{url}/{request_id}', 'GET', + payload_func())) + elif payload_func.__name__ == build_batch_payload.__name__: + for _ in range(0, batch_size): + asyncio.create_task( + make_http_request(session, url, 'POST', + payload_func(request_ids, bug_map))) + else: + for request_id in batch_request_ids: + asyncio.create_task( + make_http_request(session, url, 'POST', + payload_func(request_id, bug_map))) + index += batch_size + if index >= length: + index = 0 + + end_time = time.monotonic() + time_elapsed = end_time - start_time + if time_elapsed < FREQUENCY_IN_SECONDS: + await asyncio.sleep(FREQUENCY_IN_SECONDS - time_elapsed) + total_run_time = time.monotonic() - begin_time + + +def build_vulnerability_payload() -> None: + """The vulnerability query doesn't need a request body""" + return None + + +def build_package_payload(request_id: str, bug_map: dict) -> dict[str, any]: + """Builds a package query payload + + Args: + request_id: + The bug ID + bug_map: + A dict mapping bug IDs to the corresponding `SimpleBug` objects + + Returns: + A dict containing package query payload, example: + 
'{"package": {"name": "mruby", "ecosystem": "OSS-Fuzz"}}'
+  """
+
+  package = random.choice(bug_map[request_id].packages)
+  ecosystem = random.choice(bug_map[request_id].ecosystems)
+  return {"package": {"name": package, "ecosystem": ecosystem}}
+
+
+def build_version_payload(request_id: str, bug_map: dict) -> dict:
+  """Builds a version query payload
+
+  Args:
+    request_id:
+      The bug ID
+    bug_map:
+      A dict mapping bug IDs to the corresponding `SimpleBug` objects
+
+  Returns:
+    A dict containing package version query payload, example:
+    '{"package": {
+      "name": "mruby","ecosystem": "OSS-Fuzz"}, "version": "2.1.2rc"}'
+  """
+  package = random.choice(bug_map[request_id].packages)
+  ecosystem = random.choice(bug_map[request_id].ecosystems)
+  return {
+      "version": bug_map[request_id].affected_fuzzy,
+      "package": {
+          "name": package,
+          "ecosystem": ecosystem
+      }
+  }
+
+
+def build_batch_payload(request_ids: list,
+                        bug_map: dict) -> dict[str, list[dict[str, any]]]:
+  """Builds a batch query payload
+
+  Args:
+    request_ids:
+      A list of bug IDs
+    bug_map:
+      A dict mapping bug IDs to the corresponding `SimpleBug` objects
+
+  Returns:
+    A dict containing OSV batch query payload, example:
+    '{
+      "queries": [
+        {
+          "package": {
+            ...
+          },
+          "version": ...
+        },
+        {
+          "package": {
+            ...
+          },
+          "version": ...
+        },
+      ]
+    }'
+  """
+  size = random.randint(1, 100)
+  batch_ids = random.sample(request_ids, min(size, len(request_ids)))
+  queries = []
+  for bug_id in batch_ids:
+    query = {}
+    query_type = random.choice(['version', 'package'])
+    if query_type == 'version':
+      query = build_version_payload(bug_id, bug_map)
+    elif query_type == 'package':
+      query = build_package_payload(bug_id, bug_map)
+    queries.append(query)
+
+  return {"queries": queries}
+
+
+def get_large_batch_query(package_map: defaultdict) -> list[str]:
+  """Gets a list of bug IDs for large batch queries.
+  The list contains bug IDs from the packages with the highest
+  numbers of vulnerabilities.
+
+  Args:
+    package_map:
+      A defaultdict mapping package names to their bugs
+
+  Returns:
+    A shuffled list of bug IDs, one drawn from each of the most
+    vulnerable packages.
+  """
+  most_common = 5000
+  package_counter = Counter()
+  for package in package_map:
+    # Filter out the placeholder package name and the Linux Kernel.
+    if package in ('foo', 'Kernel'):
+      continue
+    package_counter[package] = len(package_map[package])
+  most_vulnerable_packages = package_counter.most_common(most_common)
+  large_batch_query_ids = []
+  for package, package_count in most_vulnerable_packages:
+    if package_count == 0:
+      break
+    large_batch_query_ids.append(package_map[package].pop())
+
+  random.shuffle(large_batch_query_ids)
+  return large_batch_query_ids
+
+
+async def send_version_requests(request_ids: list, bug_map: dict) -> None:
+  """Sends version query requests
+
+  Args:
+    request_ids:
+      A list of bug IDs
+    bug_map:
+      A dict mapping bug IDs to the corresponding `SimpleBug` objects
+  """
+
+  url = f'{BASE_URL}/query'
+  batch_size = VERSION_QUERY_BATCH_SIZE
+  await make_http_requests_async(request_ids, bug_map, url, batch_size,
+                                 build_version_payload)
+
+
+async def send_package_requests(request_ids: list, bug_map: dict) -> None:
+  """Sends package query requests
+
+  Args:
+    request_ids:
+      A list of bug IDs
+    bug_map:
+      A dict mapping bug IDs to the corresponding `SimpleBug` objects
+  """
+  url = f'{BASE_URL}/query'
+  batch_size = PACKAGE_QUERY_BATCH_SIZE
+  await make_http_requests_async(request_ids, bug_map, url, batch_size,
+                                 build_package_payload)
+
+
+async def send_vuln_requests(request_ids: list, bug_map: dict) -> None:
+  """Sends vulnerability get requests
+
+  Args:
+    request_ids:
+      A list of bug IDs
+    bug_map:
+      A dict mapping bug IDs to the corresponding `SimpleBug` objects
+  """
+  url = f'{BASE_URL}/vulns'
+  batch_size = VULN_QUERY_BATCH_SIZE
+  await make_http_requests_async(request_ids, bug_map, url, batch_size,
+                                 build_vulnerability_payload)
+
+
+async def send_batch_requests(request_ids: list, bug_map: dict,
+                              batch_size: int) -> None:
+  """Sends batch query requests
+
+  Args:
+    request_ids:
+      A list of bug IDs
+    bug_map:
+      A dict mapping bug IDs to the corresponding `SimpleBug` objects
+    batch_size:
+      The batch query size
+  """
+  url = f'{BASE_URL}/querybatch'
+  await make_http_requests_async(request_ids, bug_map, url, batch_size,
+                                 build_batch_payload)
+
+
+async def main() -> None:
+  osv.logs.setup_gcp_logging('staging-test')
+  seed = random.randrange(sys.maxsize)
+  # The seed value can be replaced for debugging
+  random.seed(seed)
+  logging.info(f'Random seed {seed}')
+  # The `ecosystem_map` can be used to filter our queries for a
+  # specific ecosystem.
+  ecosystem_map, bug_map, package_map = load_all_bugs()
+  vuln_query_ids = list(bug_map.keys())
+  package_query_ids = []
+  for package in package_map:
+    # Tests each package once.
+    package_query_ids.append(package_map[package].pop())
+  random.shuffle(package_query_ids)
+  random.shuffle(vuln_query_ids)
+  logging.info(f'It will send vulnerability get requests for {len(vuln_query_ids)} '
+               'vulnerabilities.')
+  logging.info('It will send package/version/batch query requests for '
+               f'{len(package_query_ids)} packages within '
+               f'{len(ecosystem_map)} ecosystems.')
+
+  # Get all packages with the most frequently occurring number
+  # of vulnerabilities.
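+  # get_large_batch_query() returns one bug ID for each of the most
+  # vulnerable packages.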
+  large_batch_query_ids = get_large_batch_query(package_map)
+
+  await asyncio.gather(
+      send_vuln_requests(vuln_query_ids, bug_map),
+      send_package_requests(package_query_ids, bug_map),
+      send_version_requests(package_query_ids, bug_map),
+      send_batch_requests(package_query_ids, bug_map, BATCH_QUERY_BATCH_SIZE),
+      send_batch_requests(large_batch_query_ids, bug_map,
+                          LARGE_BATCH_QUERY_BATCH_SIZE))
+
+
+if __name__ == "__main__":
+  asyncio.run(main())
diff --git a/docker/staging_test/retrieve_bugs_from_db.py b/docker/staging_test/retrieve_bugs_from_db.py
new file mode 100644
index 00000000000..b6ff5cd32b3
--- /dev/null
+++ b/docker/staging_test/retrieve_bugs_from_db.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fetch Bugs from datastore"""
+
+import logging
+import os
+import random
+import json
+
+import osv
+import osv.logs
+
+from google.cloud import ndb
+
+GCP_PROJECT = 'oss-vdb-test'
+BUG_DIR = '/staging_testing/all_bugs'
+
+def format_bug_for_output(bug: osv.Bug) -> dict[str, any]:
+  """Outputs ndb bug query results to JSON file
+
+  Args:
+    bug: an `osv.Bug` queried from ndb.
+
+  Returns:
+    A dict storing all the important `Bug` fields that we want to use later
+  """
+
+  affected_fuzzy = None
+  # Store one version for use as the query version later.
+  if len(bug.affected_fuzzy) > 0:
+    version_index = random.randrange(len(bug.affected_fuzzy))
+    affected_fuzzy = bug.affected_fuzzy[version_index]
+
+  return {
+      'db_id': bug.db_id,
+      'purl': bug.purl,
+      'project': bug.project,
+      'ecosystem': bug.ecosystem,
+      'affected_fuzzy': affected_fuzzy
+  }
+
+
+def get_bugs_from_datastore() -> None:
+  """Gets all bugs from the datastore and writes to `BUG_DIR`."""
+
+  entries_per_file = 10000  # number of bugs per file
+  batch_size = 1000
+  file_counter = 0
+  os.makedirs(BUG_DIR, exist_ok=True)
+
+  def write_to_json():
+    """Writes to a new JSON file."""
+    file_name = f'{BUG_DIR}/all_bugs_{file_counter}.json'
+    with open(file_name, 'w+') as f:
+      json.dump(results, f, indent=2)
+    logging.info(f'Saved {total_entries} entries to {file_name}')
+
+  with ndb.Client(project=GCP_PROJECT).context():
+    query = osv.Bug.query()
+    query = query.filter(osv.Bug.status == osv.BugStatus.PROCESSED,
+                         osv.Bug.public == True)  # pylint: disable=singleton-comparison
+    logging.info(f'Querying {query}')
+
+    results = []
+    total_entries = 0
+    next_cursor = None
+
+    while True:
+      bugs, next_cursor, has_more = query.fetch_page(
+          page_size=batch_size, start_cursor=next_cursor)
+      if not has_more:
+        break
+
+      logging.info(f'fetching {batch_size} entries.')
+      results.extend([format_bug_for_output(bug) for bug in bugs])
+      total_entries += len(bugs)
+
+      # Write bugs to separate files in case the query fails or times out.
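+      # Each file holds up to `entries_per_file` bugs; flush and start a
+      # fresh batch once that threshold is reached.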
+ if total_entries >= entries_per_file: + write_to_json() + + # Reset for the next file + results = [] + total_entries = 0 + file_counter += 1 + + # Write any remaining entries to the last file + if results: + write_to_json() + + logging.info(f'All results saved to {BUG_DIR}.') + +def main() -> None: + osv.logs.setup_gcp_logging('staging-test') + if not os.path.exists(BUG_DIR): + # This will take around 10 mins + get_bugs_from_datastore() + logging.info('Fetching data finished.') + else: + logging.info(f'{BUG_DIR} exists, skipping fetching.') + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docker/staging_test/run.sh b/docker/staging_test/run.sh new file mode 100644 index 00000000000..ce95ad3d3e3 --- /dev/null +++ b/docker/staging_test/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash -x +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +python3 /staging_testing/retrieve_bugs_from_db.py + +# Run the two processing scripts concurrently in the background +python3 /staging_testing/perform_api_calls.py & +python3 /staging_testing/perform_api_calls.py & + +# Wait for both background processes to finish +wait \ No newline at end of file From 0d34045ac51a0e8562d892530e53033d8629cce1 Mon Sep 17 00:00:00 2001 From: Holly Gong Date: Fri, 30 Aug 2024 15:43:12 +1000 Subject: [PATCH 3/6] fix lint --- .../oss-vdb-test/staging-test.yaml | 2 +- docker/staging_test/perform_api_calls.py | 24 ++++++++++--------- docker/staging_test/retrieve_bugs_from_db.py | 17 +++++++------ 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml index 59f9dca49cf..f7c0f3e3002 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml @@ -22,5 +22,5 @@ spec: memory: "3G" limits: cpu: 1 - memory: "4G" + memory: "6G" restartPolicy: Never \ No newline at end of file diff --git a/docker/staging_test/perform_api_calls.py b/docker/staging_test/perform_api_calls.py index f4e4a536212..9c5b4440802 100755 --- a/docker/staging_test/perform_api_calls.py +++ b/docker/staging_test/perform_api_calls.py @@ -148,7 +148,7 @@ async def make_http_request(session: aiohttp.ClientSession, request_url: str, The HTTP request body in JSON format """ try: - timeout = aiohttp.ClientTimeout(total=None, sock_connect=80, sock_read=80) + timeout = aiohttp.ClientTimeout(sock_connect=300, sock_read=300) if request_type == 'GET': async with session.get(request_url): pass # We're not awaiting the response, just sending the request @@ -157,8 +157,8 @@ async def make_http_request(session: aiohttp.ClientSession, request_url: str, # print(f'request: {request_body}, response: {response.status}') pass # We're not awaiting the response, just sending the request except Exception as e: - logging.warning(f'Error sending request {request_url} with 
body' - f'{request_body}: {type(e)}') + logging.warning('Error sending request %s with body %s: %s', request_url, + request_body, type(e)) async def make_http_requests_async(request_ids: list, bug_map: dict, url: str, @@ -180,8 +180,8 @@ async def make_http_requests_async(request_ids: list, bug_map: dict, url: str, """ begin_time = time.monotonic() - logging.info(f'[{begin_time}] Running make request {payload_func.__name__} ' - f'for {TOTAL_RUNTIME} seconds') + logging.info('[%s] Running make request %s for %d seconds', begin_time, + payload_func.__name__, TOTAL_RUNTIME) total_run_time = time.monotonic() - begin_time index = 0 @@ -427,7 +427,7 @@ async def main() -> None: seed = random.randrange(sys.maxsize) # The seed value can be replaced for debugging random.seed(seed) - logging.info(f'Random seed {seed}') + logging.info('Random seed %d', seed) # The `ecosystem_map` can be used to filter our queries for a # specific ecosystem. ecosystem_map, bug_map, package_map = load_all_bugs() @@ -438,11 +438,13 @@ async def main() -> None: package_query_ids.append(package_map[package].pop()) random.shuffle(package_query_ids) random.shuffle(vuln_query_ids) - logging.info(f'It will send vulnerability get requests for {len(vuln_query_ids)} ' - 'vulnerabilities.') - logging.info('It will send package/version/batch query requests for ' - f'{len(package_query_ids)} packages within ' - f'{len(ecosystem_map)} ecosystems.') + logging.info( + 'It will send vulnerability get requests for %d vulnerabilities.', + len(vuln_query_ids)) + logging.info( + 'It will send package/version/batch query requests for ' + '%d packages within %d ecosystems.', len(package_query_ids), + len(ecosystem_map)) # Get all packages with the most frequently occurring number # of vulnerabilities. 
diff --git a/docker/staging_test/retrieve_bugs_from_db.py b/docker/staging_test/retrieve_bugs_from_db.py index b6ff5cd32b3..1625659d67f 100644 --- a/docker/staging_test/retrieve_bugs_from_db.py +++ b/docker/staging_test/retrieve_bugs_from_db.py @@ -27,6 +27,7 @@ GCP_PROJECT = 'oss-vdb-test' BUG_DIR = '/staging_testing/all_bugs' + def format_bug_for_output(bug: osv.Bug) -> dict[str, any]: """Outputs ndb bug query results to JSON file @@ -65,13 +66,13 @@ def write_to_json(): file_name = f'{BUG_DIR}/all_bugs_{file_counter}.json' with open(file_name, 'w+') as f: json.dump(results, f, indent=2) - logging.info(f'Saved {total_entries} entries to {file_name}') + logging.info('Saved %d entries to %s', total_entries, file_name) with ndb.Client(project=GCP_PROJECT).context(): query = osv.Bug.query() query = query.filter(osv.Bug.status == osv.BugStatus.PROCESSED, osv.Bug.public == True) # pylint: disable=singleton-comparison - logging.info(f'Querying {query}') + logging.info('Querying %s', query) results = [] total_entries = 0 @@ -83,7 +84,7 @@ def write_to_json(): if not has_more: break - logging.info(f'fetching {batch_size} entries.') + logging.info('Fetching %d entries.', batch_size) results.extend([format_bug_for_output(bug) for bug in bugs]) total_entries += len(bugs) @@ -100,7 +101,8 @@ def write_to_json(): if results: write_to_json() - logging.info(f'All results saved to {BUG_DIR}.') + logging.info('All results saved to %s.', BUG_DIR) + def main() -> None: osv.logs.setup_gcp_logging('staging-test') @@ -109,7 +111,8 @@ def main() -> None: get_bugs_from_datastore() logging.info('Fetching data finished.') else: - logging.info(f'{BUG_DIR} exists, skipping fetching.') + logging.info('%s exists, skipping fetching.', BUG_DIR) + -if __name__ == "__main__": - main() \ No newline at end of file +if __name__ == '__main__': + main() From bb2f09e93f38dcc5da3582d89f09eb4e44484542 Mon Sep 17 00:00:00 2001 From: Holly Gong Date: Fri, 30 Aug 2024 16:02:32 +1000 Subject: [PATCH 4/6] update time --- .../gke-workers/environments/oss-vdb-test/staging-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml index f7c0f3e3002..59a18aa027f 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml @@ -3,7 +3,7 @@ kind: CronJob metadata: name: staging-test spec: - schedule: "50 * * * *" + schedule: "50 9 * * *" concurrencyPolicy: Forbid jobTemplate: spec: From 9d4154604d089f59cc45f4a15dd0eff34d01609b Mon Sep 17 00:00:00 2001 From: Holly Gong Date: Mon, 9 Sep 2024 14:11:18 +1000 Subject: [PATCH 5/6] rename cron job --- deployment/build-and-stage.yaml | 15 ++++++++------- .../environments/oss-vdb-test/kustomization.yaml | 2 +- .../{staging-test.yaml => staging-api-test.yaml} | 10 +++++----- .../{staging_test => staging_api_test}/Dockerfile | 12 ++++-------- .../{staging_test => staging_api_test}/build.sh | 8 ++++---- .../perform_api_calls.py | 7 +++++-- .../retrieve_bugs_from_db.py | 2 +- docker/{staging_test => staging_api_test}/run.sh | 10 ++++++---- 8 files changed, 34 insertions(+), 32 deletions(-) rename deployment/clouddeploy/gke-workers/environments/oss-vdb-test/{staging-test.yaml => staging-api-test.yaml} (76%) rename docker/{staging_test => staging_api_test}/Dockerfile (68%) rename docker/{staging_test => 
staging_api_test}/build.sh (70%) rename docker/{staging_test => staging_api_test}/perform_api_calls.py (98%) rename docker/{staging_test => staging_api_test}/retrieve_bugs_from_db.py (98%) rename docker/{staging_test => staging_api_test}/run.sh (71%) diff --git a/deployment/build-and-stage.yaml b/deployment/build-and-stage.yaml index f25c7198570..5e3a67236ab 100644 --- a/deployment/build-and-stage.yaml +++ b/deployment/build-and-stage.yaml @@ -98,14 +98,15 @@ steps: args: ['push', '--all-tags', 'gcr.io/oss-vdb/alias-computation'] waitFor: ['build-alias-computation', 'cloud-build-queue'] +# Build/push staging-api-test images to gcr.io/oss-vdb-test. - name: gcr.io/cloud-builders/docker - args: ['build', '-t', 'gcr.io/oss-vdb/staging-test:latest', '-t', 'gcr.io/oss-vdb/staging-test:$COMMIT_SHA', '.'] - dir: 'docker/staging_test' - id: 'build-staging-test' + args: ['build', '-t', 'gcr.io/oss-vdb-test/staging-api-test:latest', '-t', 'gcr.io/oss-vdb-test/staging-api-test:$COMMIT_SHA', '.'] + dir: 'docker/staging_api_test' + id: 'build-staging-api-test' waitFor: ['build-worker'] - name: gcr.io/cloud-builders/docker - args: ['push', '--all-tags', 'gcr.io/oss-vdb/staging-test'] - waitFor: ['build-staging-test', 'cloud-build-queue'] + args: ['push', '--all-tags', 'gcr.io/oss-vdb-test/staging-api-test'] + waitFor: ['build-staging-api-test', 'cloud-build-queue'] # Build/push cron job images. - name: gcr.io/cloud-builders/docker @@ -272,7 +273,7 @@ steps: importer=gcr.io/oss-vdb/importer:$COMMIT_SHA,\ exporter=gcr.io/oss-vdb/exporter:$COMMIT_SHA,\ alias-computation=gcr.io/oss-vdb/alias-computation:$COMMIT_SHA,\ - staging-test=gcr.io/oss-vdb/staging-test:$COMMIT_SHA,\ + staging-api-test=gcr.io/oss-vdb-test/staging-api-test:$COMMIT_SHA,\ cron=gcr.io/oss-vdb/cron:$COMMIT_SHA,\ debian-convert=gcr.io/oss-vdb/debian-convert:$COMMIT_SHA,\ combine-to-osv=gcr.io/oss-vdb/combine-to-osv:$COMMIT_SHA,\ @@ -324,7 +325,6 @@ images: - 'gcr.io/oss-vdb/importer:$COMMIT_SHA' - 'gcr.io/oss-vdb/exporter:$COMMIT_SHA' - 'gcr.io/oss-vdb/alias-computation:$COMMIT_SHA' -- 'gcr.io/oss-vdb/staging-test:$COMMIT_SHA' - 'gcr.io/oss-vdb/cron:$COMMIT_SHA' - 'gcr.io/oss-vdb/alpine-cve-convert:$COMMIT_SHA' - 'gcr.io/oss-vdb/debian-cve-convert:$COMMIT_SHA' @@ -336,3 +336,4 @@ images: - 'gcr.io/oss-vdb/cpe-repo-gen:$COMMIT_SHA' - 'gcr.io/oss-vdb/nvd-cve-osv:$COMMIT_SHA' - 'gcr.io/oss-vdb/nvd-mirror:$COMMIT_SHA' +- 'gcr.io/oss-vdb-test/staging-api-test:$COMMIT_SHA' diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml index eb4b6019790..747e6a9f7c4 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml @@ -1,6 +1,6 @@ resources: - ../../base -- staging-test.yaml +- staging-api-test.yaml patches: - path: workers.yaml - path: scaler.yaml diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml similarity index 76% rename from deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml rename to deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml index 59a18aa027f..4756a72e506 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-test.yaml +++ 
b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml @@ -1,7 +1,7 @@ apiVersion: batch/v1 kind: CronJob metadata: - name: staging-test + name: staging-api-test spec: schedule: "50 9 * * *" concurrencyPolicy: Forbid @@ -10,17 +10,17 @@ spec: template: spec: containers: - - name: staging-test - image: staging-test + - name: staging-api-test + image: staging-api-test imagePullPolicy: Always env: - name: GOOGLE_CLOUD_PROJECT value: "oss-vdb-test" resources: requests: - cpu: 1 + cpu: 1.5 memory: "3G" limits: - cpu: 1 + cpu: 2 memory: "6G" restartPolicy: Never \ No newline at end of file diff --git a/docker/staging_test/Dockerfile b/docker/staging_api_test/Dockerfile similarity index 68% rename from docker/staging_test/Dockerfile rename to docker/staging_api_test/Dockerfile index a2cb54b9f5c..af19e653ccc 100644 --- a/docker/staging_test/Dockerfile +++ b/docker/staging_api_test/Dockerfile @@ -14,17 +14,13 @@ FROM gcr.io/oss-vdb/worker -WORKDIR /staging_testing +WORKDIR /staging_api_test -COPY retrieve_bugs_from_db.py /staging_testing -COPY perform_api_calls.py /staging_testing -COPY run.sh /staging_testing +COPY retrieve_bugs_from_db.py perform_api_calls.py run.sh ./ # Add aiohttp lib RUN cd /env/docker/worker && POETRY_VIRTUALENVS_CREATE=false poetry add aiohttp -RUN chmod 755 /staging_testing/retrieve_bugs_from_db.py -RUN chmod 755 /staging_testing/perform_api_calls.py -RUN chmod 755 /staging_testing/run.sh +RUN chmod 755 retrieve_bugs_from_db.py perform_api_calls.py run.sh -ENTRYPOINT ["/staging_testing/run.sh"] +ENTRYPOINT ["./run.sh"] diff --git a/docker/staging_test/build.sh b/docker/staging_api_test/build.sh similarity index 70% rename from docker/staging_test/build.sh rename to docker/staging_api_test/build.sh index 8421fa3f69b..261f31a0d74 100755 --- a/docker/staging_test/build.sh +++ b/docker/staging_api_test/build.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -docker build -t gcr.io/oss-vdb/staging-test:$1 . && \ -docker build -t gcr.io/oss-vdb/staging-test:latest . && \ -docker push gcr.io/oss-vdb/staging-test:$1 && \ -docker push gcr.io/oss-vdb/staging-test:latest +docker build -t gcr.io/oss-vdb-test/staging-api-test:$1 . && \ +docker build -t gcr.io/oss-vdb-test/staging-api-test:latest . && \ +docker push gcr.io/oss-vdb-test/staging-api-test:$1 && \ +docker push gcr.io/oss-vdb-test/staging-api-test:latest diff --git a/docker/staging_test/perform_api_calls.py b/docker/staging_api_test/perform_api_calls.py similarity index 98% rename from docker/staging_test/perform_api_calls.py rename to docker/staging_api_test/perform_api_calls.py index 9c5b4440802..62777ecdde1 100755 --- a/docker/staging_test/perform_api_calls.py +++ b/docker/staging_api_test/perform_api_calls.py @@ -33,7 +33,7 @@ BASE_URL = 'https://api.test.osv.dev/v1' GCP_PROJECT = 'oss-vdb-test' -BUG_DIR = '/staging_testing/all_bugs' +BUG_DIR = './all_bugs' # Total run time in seconds TOTAL_RUNTIME = 3600 @@ -154,9 +154,12 @@ async def make_http_request(session: aiohttp.ClientSession, request_url: str, pass # We're not awaiting the response, just sending the request elif request_type == 'POST': async with session.post(request_url, json=request_body, timeout=timeout): - # print(f'request: {request_body}, response: {response.status}') pass # We're not awaiting the response, just sending the request except Exception as e: + # When sending a large number of requests concurrently, + # some may fail due to timeout issues. 
+    # These failures can be ignored as long as the server receives a
+    # sufficient volume of successful requests.
     logging.warning('Error sending request %s with body %s: %s', request_url,
                     request_body, type(e))
diff --git a/docker/staging_test/retrieve_bugs_from_db.py b/docker/staging_api_test/retrieve_bugs_from_db.py
similarity index 98%
rename from docker/staging_test/retrieve_bugs_from_db.py
rename to docker/staging_api_test/retrieve_bugs_from_db.py
index 1625659d67f..f4d1541d0f8 100644
--- a/docker/staging_test/retrieve_bugs_from_db.py
+++ b/docker/staging_api_test/retrieve_bugs_from_db.py
@@ -25,7 +25,7 @@
 from google.cloud import ndb
 
 GCP_PROJECT = 'oss-vdb-test'
-BUG_DIR = '/staging_testing/all_bugs'
+BUG_DIR = './all_bugs'
 
 
 def format_bug_for_output(bug: osv.Bug) -> dict[str, any]:
diff --git a/docker/staging_test/run.sh b/docker/staging_api_test/run.sh
similarity index 71%
rename from docker/staging_test/run.sh
rename to docker/staging_api_test/run.sh
index ce95ad3d3e3..5130c7ad915 100644
--- a/docker/staging_test/run.sh
+++ b/docker/staging_api_test/run.sh
@@ -13,11 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-python3 /staging_testing/retrieve_bugs_from_db.py
+python3 ./retrieve_bugs_from_db.py
 
-# Run the two processing scripts concurrently in the background
-python3 /staging_testing/perform_api_calls.py &
-python3 /staging_testing/perform_api_calls.py &
+# `aiohttp` has limits on the number of simultaneous connections.
+# Running two instances of the program in parallel
+# can help circumvent this restriction.
+python3 ./perform_api_calls.py &
+python3 ./perform_api_calls.py &
 
 # Wait for both background processes to finish
 wait
\ No newline at end of file

From a2cf07661f649e73475c124118e8cdc334280c1d Mon Sep 17 00:00:00 2001
From: Holly Gong
Date: Mon, 9 Sep 2024 14:29:17 +1000
Subject: [PATCH 6/6] increase memory

---
 .../environments/oss-vdb-test/staging-api-test.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml
index 4756a72e506..b44dd7fec6c 100644
--- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml
+++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/staging-api-test.yaml
@@ -19,8 +19,8 @@ spec:
           resources:
             requests:
               cpu: 1.5
-              memory: "3G"
+              memory: "4G"
             limits:
               cpu: 2
-              memory: "6G"
+              memory: "10G"
       restartPolicy: Never
\ No newline at end of file