Skip to content

Commit

Permalink
Fix a bug that causes premature termination of processing (#2283)
Browse files Browse the repository at this point in the history
It turns out that trying to reimport a subsequently non-existent record
(i.e. when the originating CVE was rejected), the source file no longer
exists in GCS and the retrieval failure raises an exception, which is
being silently caught, which then caused the transaction to terminate.

So don't silently catch exceptions, other than the general one used to
terminate in dry-run mode.

Also removed unnecessary copy-pasta for this program's use case (it's
hard-limited to 30 records at a time due to Datastore query limits, so
batching makes no sense).
  • Loading branch information
andrewpollock authored Jun 6, 2024
1 parent 8be8626 commit ccd8495
Showing 1 changed file with 27 additions and 22 deletions.
49 changes: 27 additions & 22 deletions tools/datafix/reimport_gcs_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@

from google.cloud import datastore
from google.cloud import storage
from google.cloud.exceptions import NotFound
from google.cloud.storage import retry
from google.cloud.datastore.query import PropertyFilter

import argparse
import os
import functools

MAX_BATCH_SIZE = 500
MAX_QUERY_SIZE = 30


Expand Down Expand Up @@ -168,29 +168,34 @@ def main() -> None:
result_to_fix = [r for r in result if r['source_of_truth'] == 2]
print(f"There are {len(result_to_fix)} bugs to operate on...")

# Chunk the results to modify in acceptibly sized batches for the API.
for batch in range(0, len(result_to_fix), MAX_BATCH_SIZE):
try:
with ds_client.transaction() as xact:
for bug in result_to_fix[batch:batch + MAX_BATCH_SIZE]:
bug_in_gcs = objname_for_bug(ds_client, bug)
if args.verbose:
print(f"Resetting creation time for {bug_in_gcs['uri']}")
if not args.dryrun:
try:
with ds_client.transaction() as xact:
for bug in result_to_fix:
bug_in_gcs = objname_for_bug(ds_client, bug)
if args.verbose:
print(f"Resetting creation time for {bug_in_gcs['uri']}")
if not args.dryrun:
try:
reset_object_creation(bug_in_gcs["bucket"], bug_in_gcs["path"],
args.tmpdir)
bug["import_last_modified"] = None
if args.verbose:
print(f"Resetting import_last_modified for {bug['db_id']}")
print(f"Review at {url_base}{bug['db_id']} when reimport completes.")
xact.put(bug)
if args.dryrun:
raise Exception("Dry run mode. Preventing transaction from commiting") # pylint: disable=broad-exception-raised
except Exception as e:
# Don't have the first batch's transaction-aborting exception stop
# subsequent batches from being attempted.
if args.dryrun and e.args[0].startswith("Dry run mode"):
pass
except NotFound as e:
if args.verbose:
print(f"Skipping, got {e}\n")
continue
bug["import_last_modified"] = None
if args.verbose:
print(f"Resetting import_last_modified for {bug['db_id']}")
print(f"Review at {url_base}{bug['db_id']} when reimport completes.")
xact.put(bug)
if args.dryrun:
raise Exception("Dry run mode. Preventing transaction from commiting") # pylint: disable=broad-exception-raised
except Exception as e:
# Don't have the first batch's transaction-aborting exception stop
# subsequent batches from being attempted.
if args.dryrun and e.args[0].startswith("Dry run mode"):
pass
else:
raise


if __name__ == "__main__":
Expand Down

0 comments on commit ccd8495

Please sign in to comment.