fix: Add docstring, fix issue with BadArgumentError (#2506)

Co-authored-by: Andrew Pollock <[email protected]>
google · Aug 23, 2024 · 57c03cb · 57c03cb
1 parent 773a0c7
commit 57c03cb
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 11 deletions.
diff --git a/docs/api/post-v1-query.md b/docs/api/post-v1-query.md
@@ -25,12 +25,12 @@ To query multiple packages at once, see further information [here](post-v1-query
 ## Parameters
 
 |---
-| Parameter         | Type   | Description                                                                                                                                                    |
-| ----------------- | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `commit`          | string | The commit hash to query for. If specified, `version` should not be set.                                                                                       |
-| `version`         | string | The version string to query for. A fuzzy match is done against upstream versions. If specified, `commit` should not be set.                                    |
-| `package`         | object | The package to query against. When a `commit` hash is given, this is optional.                                                                                 |
-| `page_token`      | string | If your previous query fetched a large number of results, the response will be paginated. This is an optional field. Please see the [pagination section](#pagination) for more information. |
+| Parameter    | Type   | Description                                                                                                                                                                                 |
+| ------------ | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `commit`     | string | The commit hash to query for. If specified, `version` should not be set.                                                                                                                    |
+| `version`    | string | The version string to query for. A fuzzy match is done against upstream versions. If specified, `commit` should not be set.                                                                 |
+| `package`    | object | The package to query against. When a `commit` hash is given, this is optional.                                                                                                              |
+| `page_token` | string | If your previous query fetched a large number of results, the response will be paginated. This is an optional field. Please see the [pagination section](#pagination) for more information. |
 
 Package Objects can be described by package name AND ecosystem OR by the package URL. 
 
@@ -130,7 +130,7 @@ curl -d \
 
 The OSV.dev API uses pagination for queries that return a large number of vulnerabilities. When pagination is used, the `next_page_token` is given in the response, indicating that there are more results to return. You will need to run additional queries using the `page_token` to see the remaining results, repeating queries until the `next_page_token` is no longer included in the response. 
 
-For the `v1/query` endpoint pagination will occur when there more than 1,000 vulnerabilities in the response.  The page size can vary slightly because of threading and  may change in the future.
+For the `v1/query` endpoint, pagination will occur when there are more than 1,000 vulnerabilities in the response, or when the query has exceeded 20 seconds. The page size can vary slightly because of threading and may change in the future.
 
 A response indicating pagination will be in this form:
 ```json
@@ -151,3 +151,5 @@ curl -d \
 
 ```
 
+{: .note }
+In rare cases, the response might contain **only** the `next_page_token`. In those cases, there might be more data that can be retrieved but was not found within the time limit. Please keep querying with the `next_page_token` until either results are returned or no more page tokens are returned.
diff --git a/gcp/api/server.py b/gcp/api/server.py
@@ -285,6 +285,7 @@ def QueryAffectedBatch(self, request, context: grpc.ServicerContext):
           logging.warning(e)
           context.abort(grpc.StatusCode.INVALID_ARGUMENT,
                         f'Invalid page token at index: {i}.')
+
       query_context = QueryContext(
           service_context=context,
           request_cutoff_time=req_cutoff_time,
@@ -403,13 +404,25 @@ def page_limit(self) -> int:
 
 @dataclass
 class QueryContext:
+  """
+  Information about the query the server is currently
+  responding to.
+  """
   service_context: grpc.ServicerContext
   page_token: ndb.Cursor | None
   request_cutoff_time: datetime
   # Use a dataclass to copy by reference
   total_responses: ResponsesCount
 
   def should_break_page(self, response_count: int):
+    """
+    Returns whether the API should finish its current page here 
+    and return a cursor.
+
+    Currently uses two criteria:
+      - response count reaching (or exceeding) the page limit
+      - request exceeding the cutoff time
+    """
     return (response_count >= self.total_responses.page_limit() or
             datetime.now() > self.request_cutoff_time)
 
@@ -964,7 +977,10 @@ def _query_by_generic_version(
                                                   ecosystem, purl, version,
                                                   False)
 
-  if results:
+  # If there are no results because of a page break, there is no reason
+  # to continue to the queries below: the page break would still be in
+  # effect, and each of them would have to return immediately.
+  if results or context.should_break_page(0):
     return results, cursor
 
   # page_token can be the token for this query, or the token for the one
@@ -978,7 +994,7 @@ def _query_by_generic_version(
                                                   osv.normalize_tag(version),
                                                   True)
 
-  if results:
+  if results or context.should_break_page(0):
     return results, cursor
 
   # Try again after canonicalizing + normalizing version.
@@ -995,12 +1011,22 @@ def query_by_generic_helper(results: list, cursor, context: QueryContext,
                             base_query: ndb.Query, project: str, ecosystem: str,
                             purl: PackageURL | None, version: str,
                             is_normalized):
-  """Helper function for query_by_generic."""
+  """
+  Helper function for query_by_generic. 
+  This function can be called multiple times.
+  """
   query: ndb.Query = base_query.filter(osv.Bug.affected_fuzzy == version)
   it: ndb.QueryIterator = query.iter(start_cursor=context.page_token)
   while (yield it.has_next_async()):
     if context.should_break_page(len(results)):
-      cursor = it.cursor_after()
+      # Because this helper function might be called multiple times
+      # we might break before the first result is even queried, raising
+      # a BadArgumentError
+      try:
+        cursor = it.cursor_after()
+      except ndb_exceptions.BadArgumentError:
+        # Don't set the cursor in this case and just return existing cursor
+        pass
       break
     bug = it.next()
     if _is_version_affected(