fix: Preset API pagination
betodealmeida committed Sep 10, 2024
1 parent 7ecfc4c · commit cafac4e
Showing 6 changed files with 36 additions and 51 deletions.
CHANGELOG.rst (3 additions, 0 deletions)
@@ -5,6 +5,9 @@ Changelog
 Next
 ====
 
+- Remove pagination from the Preset API and add a note to the documentation (#469)
+- Add ``UPGRADE()`` function to upgrade the library (#468)
+
 Version 1.2.26 - 2024-08-08
 ===========================
 
docs/development.rst (2 additions, 0 deletions)
@@ -296,6 +296,8 @@ If an adapter declares support for ``LIMIT`` and ``OFFSET`` a corresponding para…
 Now the adapter can handle ``limit`` and ``offset``, reducing the amount of data that is returned. Note that even if the adapter declares support for ``LIMIT``, SQLite will still enforce the limit; i.e., if for any reason the adapter returns more rows than the limit, SQLite will fix the problem. The same is not true for the offset.
+
+Note that an adapter can only support ``limit`` and ``offset`` if it has filters on all the columns. Otherwise it could return only a subset of the data, since after the adapter stops at ``limit`` rows SQLite could filter the result further.
 
 Returning only the requested columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
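The contract described in the new note is easiest to see in code. Below is a minimal sketch, not part of this commit, of a ``get_data`` that can honor ``limit`` and ``offset`` safely because every column is filtered upstream; ``fetch_filtered_rows`` is an invented stand-in:

```python
import itertools
from typing import Any, Dict, Iterator, Optional

Row = Dict[str, Any]


def fetch_filtered_rows(bounds: Dict[str, Any]) -> Iterator[Row]:
    """Invented stand-in for a source that already applies every filter."""
    return iter({"value": i} for i in range(1000))


def get_data(
    bounds: Dict[str, Any],
    limit: Optional[int] = None,
    offset: Optional[int] = None,
) -> Iterator[Row]:
    # Slicing is safe only because all columns are filtered upstream:
    # SQLite has nothing left to discard, so the slice cannot hide rows.
    start = offset or 0
    stop = start + limit if limit is not None else None
    yield from itertools.islice(fetch_filtered_rows(bounds), start, stop)
```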
src/shillelagh/adapters/api/generic_json.py (0 additions, 2 deletions)
@@ -134,8 +134,6 @@ def get_data(  # pylint: disable=unused-argument, too-many-arguments
         self,
         bounds: Dict[str, Filter],
         order: List[Tuple[str, RequestedOrder]],
-        limit: Optional[int] = None,
-        offset: Optional[int] = None,
         requested_columns: Optional[Set[str]] = None,
         **kwargs: Any,
     ) -> Iterator[Row]:
src/shillelagh/adapters/api/generic_xml.py (0 additions, 2 deletions)
@@ -60,8 +60,6 @@ def get_data(  # pylint: disable=unused-argument, too-many-arguments
         self,
         bounds: Dict[str, Filter],
         order: List[Tuple[str, RequestedOrder]],
-        limit: Optional[int] = None,
-        offset: Optional[int] = None,
         requested_columns: Optional[Set[str]] = None,
         **kwargs: Any,
     ) -> Iterator[Row]:
src/shillelagh/adapters/api/preset.py (19 additions, 33 deletions)
@@ -4,6 +4,7 @@
 This is a derivation of the generic JSON adapter that handles Preset auth.
 """
 
+import itertools
 import logging
 import re
 from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, cast
@@ -81,55 +82,36 @@ def __init__(  # pylint: disable=too-many-arguments
         )
 
 
-def get_urls(
-    resource_url: str,
-    offset: Optional[int] = None,
-    limit: Optional[int] = None,
-    page_size: int = MAX_PAGE_SIZE,
-) -> Iterator[Tuple[str, slice]]:
+def get_urls(resource_url: str, page_size: int = MAX_PAGE_SIZE) -> Iterator[str]:
     """
-    Get all paginated URLs to download data together with a limit/offset slice.
+    Get all URLs to download data.
     """
-    start = offset or 0
-    stop = start + limit if limit is not None else None
-
     baseurl = URL(resource_url)
     query = baseurl.query.get("q", "()")
     try:
         params = prison.loads(query)
     except Exception:  # pylint: disable=broad-except
-        yield str(baseurl), slice(start, stop)
+        yield str(baseurl)
         return
 
     # assume the user knows better and keep the URL unmodified
     if "page" in params or "page_size" in params:
-        yield str(baseurl), slice(start, stop)
+        yield str(baseurl)
         return
 
-    page = start // page_size
-    start = start % page_size
-    remaining = limit if limit is not None else float("inf")
+    params["page_size"] = page_size
+    page = 0
     while True:
         params["page"] = page
-        params["page_size"] = min(start + remaining, page_size)
-        yield str(baseurl.with_query({"q": prison.dumps(params)})), slice(start, None)
-
-        remaining -= page_size - start
-        if remaining <= 0:
-            break
-
+        yield str(baseurl.with_query({"q": prison.dumps(params)}))
         page += 1
-        start = 0
 
 
 class PresetWorkspaceAPI(PresetAPI):
     """
     Adapter for Preset workspaces.
     """
 
-    supports_limit = True
-    supports_offset = True
-
     default_path = "$.result[*]"
     cache_name = "preset_cache"

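After the rewrite, ``get_urls`` is an infinite generator of page URLs: the slice bookkeeping is gone, and stopping becomes the caller's responsibility. A quick sketch of the new behavior (the URLs match the updated tests further down):

```python
import itertools

from shillelagh.adapters.api.preset import get_urls

urls = get_urls("https://abcdef01.us1a.app.preset.io/api/v1/chart/", page_size=100)
for url in itertools.islice(urls, 2):  # the generator itself never stops
    print(url)
# https://abcdef01.us1a.app.preset.io/api/v1/chart/?q=(page:0,page_size:100)
# https://abcdef01.us1a.app.preset.io/api/v1/chart/?q=(page:1,page_size:100)
```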
@@ -143,10 +125,16 @@ def supports(cls, uri: str, fast: bool = True, **kwargs: Any) -> Optional[bool]:
         )
 
     def _set_columns(self) -> None:
-        # request only a single page of results to infer schema
-        rows = list(self.get_data({}, [], limit=MAX_PAGE_SIZE))
-        column_names = list(rows[0].keys()) if rows else []
+        rows = self.get_data({}, [])
 
+        try:
+            row = next(rows)
+            rows = itertools.chain([row], rows)
+            column_names = list(row.keys())
+        except StopIteration:
+            column_names = []
+
+        rows = itertools.islice(rows, MAX_PAGE_SIZE)
         _, order, types = analyze(iter(rows))
 
         self.columns = {
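The new ``_set_columns`` peeks at the first row for the column names, reattaches it with ``itertools.chain`` so it is not lost, and caps schema inference at ``MAX_PAGE_SIZE`` rows with ``itertools.islice``. The same pattern in isolation, as a standalone sketch:

```python
import itertools
from typing import Any, Dict, Iterator, List, Tuple

Row = Dict[str, Any]


def peek_columns(rows: Iterator[Row], max_rows: int) -> Tuple[List[str], Iterator[Row]]:
    """Return the column names and an iterator capped at ``max_rows``."""
    try:
        first = next(rows)
    except StopIteration:
        return [], iter([])
    # chain() puts the consumed row back in front, so no data is lost
    capped = itertools.islice(itertools.chain([first], rows), max_rows)
    return list(first.keys()), capped


columns, sample = peek_columns(iter([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]), 100)
assert columns == ["id", "name"]
assert len(list(sample)) == 2
```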
@@ -163,12 +151,10 @@ def get_data(  # pylint: disable=unused-argument, too-many-arguments, too-many-locals
         self,
         bounds: Dict[str, Filter],
         order: List[Tuple[str, RequestedOrder]],
-        limit: Optional[int] = None,
-        offset: Optional[int] = None,
         requested_columns: Optional[Set[str]] = None,
         **kwargs: Any,
     ) -> Iterator[Row]:
-        for url, slice_ in get_urls(self.uri, offset, limit, MAX_PAGE_SIZE):
+        for url in get_urls(self.uri, MAX_PAGE_SIZE):
             response = self._session.get(str(url))
             payload = response.json()
             if not response.ok:
@@ -178,7 +164,7 @@ def get_data(  # pylint: disable=unused-argument, too-many-arguments, too-many-locals
                 )
             raise ProgrammingError(f"Error: {messages}")
 
-            rows = jsonpath.findall(self.path, payload)[slice_]
+            rows = jsonpath.findall(self.path, payload)
             if not rows:
                 break
 
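Because ``get_urls`` never exhausts on its own, the ``if not rows: break`` check above is now the sole termination condition: pagination ends at the first empty page. Reduced to a sketch, with ``fetch_page`` as an invented stand-in for the HTTP request plus JSONPath extraction:

```python
from typing import Any, Dict, Iterator, List


def fetch_page(url: str) -> List[Dict[str, Any]]:
    """Invented stand-in: GET ``url`` and extract the rows from the payload."""
    raise NotImplementedError


def iterate_rows(urls: Iterator[str]) -> Iterator[Dict[str, Any]]:
    for url in urls:
        rows = fetch_page(url)
        if not rows:  # an empty page means we have read past the last row
            break
        yield from rows
```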
tests/adapters/api/preset_test.py (12 additions, 14 deletions)
@@ -99,25 +99,19 @@ def test_get_urls() -> None:
     """
     gen = get_urls(
         "https://abcdef01.us1a.app-sdx.preset.io/api/v1/chart/",
-        offset=45,
-        limit=50,
         page_size=42,
     )
 
-    url, slice_ = next(gen)
+    url = next(gen)
     assert (
         url
-        == "https://abcdef01.us1a.app-sdx.preset.io/api/v1/chart/?q=(page:1,page_size:42)"
+        == "https://abcdef01.us1a.app-sdx.preset.io/api/v1/chart/?q=(page:0,page_size:42)"
     )
-    assert slice_.start == 3
-    url, slice_ = next(gen)
+    url = next(gen)
     assert (
         url
-        == "https://abcdef01.us1a.app-sdx.preset.io/api/v1/chart/?q=(page:2,page_size:11)"
+        == "https://abcdef01.us1a.app-sdx.preset.io/api/v1/chart/?q=(page:1,page_size:42)"
     )
-    assert slice_.start == 0
-    with pytest.raises(StopIteration):
-        next(gen)
 
 
 def test_get_urls_unable_to_parse() -> None:
@@ -126,7 +120,7 @@ def test_get_urls_unable_to_parse() -> None:
     """
 
     gen = get_urls("https://example.org/?q=(((")
-    assert next(gen)[0] == "https://example.org/?q=((("
+    assert next(gen) == "https://example.org/?q=((("
     with pytest.raises(StopIteration):
         next(gen)
 
@@ -137,7 +131,7 @@ def test_get_urls_with_page_parameters() -> None:
     """
 
     gen = get_urls("https://example.org/?q=(page:0,page_size:42)")
-    assert next(gen)[0] == "https://example.org/?q=(page:0,page_size:42)"
+    assert next(gen) == "https://example.org/?q=(page:0,page_size:42)"
     with pytest.raises(StopIteration):
         next(gen)
 
@@ -197,13 +191,17 @@ def test_preset_workspace_pagination(requests_mock: Mocker) -> None:
         },
     )
     requests_mock.get(
-        "https://abcdef01.us1a.app.preset.io/api/v1/chart/?q=(page:1,page_size:3)",
+        "https://abcdef01.us1a.app.preset.io/api/v1/chart/?q=(page:1,page_size:100)",
         json={
             "result": [
-                {"id": i + 101, "slice_name": f"Team {i+101}"} for i in range(3)
+                {"id": i + 101, "slice_name": f"Team {i+101}"} for i in range(50)
             ],
         },
    )
+    requests_mock.get(
+        "https://abcdef01.us1a.app.preset.io/api/v1/chart/?q=(page:2,page_size:100)",
+        json={"result": []},
+    )
 
     connection = connect(
         ":memory:",
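The added ``page:2`` mock returns an empty result set, which is what lets the adapter stop; without it, the new code would request pages forever. The test then queries the mocked workspace through a shillelagh connection; the continuation below is a hypothetical sketch rather than part of the diff, and the ``adapter_kwargs`` key and credentials are assumptions:

```python
from shillelagh.backends.apsw.db import connect

connection = connect(
    ":memory:",
    adapter_kwargs={
        # adapter name and tokens are assumed, for illustration only
        "presetworkspaceapi": {"access_token": "XXX", "access_secret": "YYY"},
    },
)
cursor = connection.cursor()
sql = 'SELECT COUNT(*) FROM "https://abcdef01.us1a.app.preset.io/api/v1/chart/"'
cursor.execute(sql)
print(cursor.fetchone())  # rows from pages 0 and 1; the empty page 2 ends the loop
```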