Skip to content

Commit

Permalink
Add durable support for overriding the bucket
Browse files Browse the repository at this point in the history
This is a more durable solution to the edge case we were trying to
correct from, where the Bug in Datastore was from a GIT source (with a
different path to the source file) than what we wanted to trigger
reimporting for, and so various sanity checks and assumptions fail.

This adds support for overriding the bucket name (and optionally, the
path within the bucket), which is normally self-determined from the
existing record.

```
pipenv run python tools/datafix/reimport_gcs_record.py --verbose --no-dry-run --project oss-vdb --bucket cve-osv-conversion:osv-output CVE-2022-2816
```
  • Loading branch information
andrewpollock committed Aug 7, 2024
1 parent baa360d commit f8bb23a
Showing 1 changed file with 31 additions and 5 deletions.
36 changes: 31 additions & 5 deletions tools/datafix/reimport_gcs_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,41 @@ class UnexpectedSituation(Exception):
pass


def objname_for_bug(client: datastore.Client,
bug: datastore.entity.Entity) -> dict:
def objname_for_bug(client: datastore.Client, bug: datastore.entity.Entity,
forced_bucket_name: str) -> dict:
"""Returns the GCS object details for a given Bug.
Args:
client: an initialized Cloud Datastore client.
bug: a Bug Cloud Datastore entity.
forced_bucket_name: bucket name (with optional colon-separated path) to
forcibly use.
Returns:
A dict with keys for the GCS uri, the bucket name and path within the
bucket.
"""
source_object_path = bug["source_id"].split(":")[1]

if forced_bucket_name:
(bucket, _, bucketpath) = forced_bucket_name.partition(":")
# The assumption is that when passed a different bucket path, only the
# current object's base filename is relevant.
return {
"uri":
"gs://" + os.path.join(bucket, bucketpath,
os.path.basename(source_object_path)),
"bucket":
bucket,
"path":
os.path.join(bucketpath, os.path.basename(source_object_path))
}

bucket = bucket_for_source(client, bug["source"])
return {
"uri": "gs://" + os.path.join(bucket, bug["source_id"].split(":")[1]),
"uri": "gs://" + os.path.join(bucket, source_object_path),
"bucket": bucket,
"path": bug["source_id"].split(":")[1]
"path": source_object_path
}


Expand Down Expand Up @@ -150,6 +168,13 @@ def main() -> None:
dest="tmpdir",
default="/tmp",
help="Local directory to copy to from GCS")
parser.add_argument(
"--bucket",
action="store",
dest="bucket",
default=None,
help=("Override the bucket name (and with a colon + path, the path) "
"for the object in GCS"))
args = parser.parse_args()

if len(args.bugs[0]) > MAX_QUERY_SIZE:
Expand All @@ -172,7 +197,8 @@ def main() -> None:
with ds_client.transaction() as xact:
for bug in result_to_fix:
try:
bug_in_gcs = objname_for_bug(ds_client, bug)
bug_in_gcs = objname_for_bug(
ds_client, bug, forced_bucket_name=args.bucket)
except UnexpectedSituation as e:
if args.verbose:
print(f"Skipping {bug['db_id']}, got {e}\n")
Expand Down

0 comments on commit f8bb23a

Please sign in to comment.