From f8bb23aa27955609bc68b2f7066958fda0391a59 Mon Sep 17 00:00:00 2001 From: Andrew Pollock Date: Wed, 7 Aug 2024 05:18:14 +0000 Subject: [PATCH 1/2] Add durable support for overriding the bucket This is a more durable solution to the edge case we were trying to correct for, where the Bug in Datastore was from a different source (a GIT source, with a different path to the source file) than the one we wanted to trigger reimporting for, and so various sanity checks and assumptions fail. This adds support for overriding the bucket name (and optionally, the path within the bucket), which is normally self-determined from the existing record. ``` pipenv run python tools/datafix/reimport_gcs_record.py --verbose --no-dry-run --project oss-vdb --bucket cve-osv-conversion:osv-output CVE-2022-2816 ``` --- tools/datafix/reimport_gcs_record.py | 36 ++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/tools/datafix/reimport_gcs_record.py b/tools/datafix/reimport_gcs_record.py index 4ce7233dbc5..4abb6c7921a 100755 --- a/tools/datafix/reimport_gcs_record.py +++ b/tools/datafix/reimport_gcs_record.py @@ -31,23 +31,41 @@ class UnexpectedSituation(Exception): pass -def objname_for_bug(client: datastore.Client, - bug: datastore.entity.Entity) -> dict: +def objname_for_bug(client: datastore.Client, bug: datastore.entity.Entity, + forced_bucket_name: str) -> dict: """Returns the GCS object details for a given Bug. Args: client: an initialized Cloud Datastore client. bug: a Bug Cloud Datastore entity. + forced_bucket_name: bucket name (with optional colon-separated path) to + forcibly use. Returns: A dict with keys for the GCS uri, the bucket name and path within the bucket. """ + source_object_path = bug["source_id"].split(":")[1] + + if forced_bucket_name: + (bucket, _, bucketpath) = forced_bucket_name.partition(":") + # The assumption is that when passed a different bucket path, only the + # current object's base filename is relevant. 
+ return { + "uri": + "gs://" + os.path.join(bucket, bucketpath, + os.path.basename(source_object_path)), + "bucket": + bucket, + "path": + os.path.join(bucketpath, os.path.basename(source_object_path)) + } + bucket = bucket_for_source(client, bug["source"]) return { - "uri": "gs://" + os.path.join(bucket, bug["source_id"].split(":")[1]), + "uri": "gs://" + os.path.join(bucket, source_object_path), "bucket": bucket, - "path": bug["source_id"].split(":")[1] + "path": source_object_path } @@ -150,6 +168,13 @@ def main() -> None: dest="tmpdir", default="/tmp", help="Local directory to copy to from GCS") + parser.add_argument( + "--bucket", + action="store", + dest="bucket", + default=None, + help=("Override the bucket name (and with a colon + path, the path) " + "for the object in GCS")) args = parser.parse_args() if len(args.bugs[0]) > MAX_QUERY_SIZE: @@ -172,7 +197,8 @@ def main() -> None: with ds_client.transaction() as xact: for bug in result_to_fix: try: - bug_in_gcs = objname_for_bug(ds_client, bug) + bug_in_gcs = objname_for_bug( + ds_client, bug, forced_bucket_name=args.bucket) except UnexpectedSituation as e: if args.verbose: print(f"Skipping {bug['db_id']}, got {e}\n") From a03a80e5a7f2d40c6c17d8059def0824741dce5a Mon Sep 17 00:00:00 2001 From: Andrew Pollock Date: Tue, 13 Aug 2024 00:35:26 +0000 Subject: [PATCH 2/2] fix: add an example to help text Adds an example to describe the more complex way of overriding the bucket. --- tools/datafix/reimport_gcs_record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/datafix/reimport_gcs_record.py b/tools/datafix/reimport_gcs_record.py index 4abb6c7921a..defa67c0572 100755 --- a/tools/datafix/reimport_gcs_record.py +++ b/tools/datafix/reimport_gcs_record.py @@ -174,7 +174,7 @@ def main() -> None: dest="bucket", default=None, help=("Override the bucket name (and with a colon + path, the path) " - "for the object in GCS")) + "for the object in GCS (e.g. 
`cve-osv-conversion:osv-output`)")) args = parser.parse_args() if len(args.bugs[0]) > MAX_QUERY_SIZE: