v0.1.4: bump cachew version, drop py3.7 support
also removes the requirement to have a `.return_type`
purarue authored Sep 30, 2023
2 parents d7c250b + 17b2a23 commit 1584cb8
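The headline change: handlers no longer need a manually attached `.return_type`; the model type a handler produces is now inferred from its return annotation. A minimal sketch of the convention (the `SomeModel` handler below is hypothetical, not from this commit):

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Iterator, Union


@dataclass
class SomeModel:
    # hypothetical model standing in for Activity, Location, etc.
    name: str


# Before: code had to set `_parse_something.return_type = SomeModel`.
# Now the yielded type is read from the return annotation itself.
def _parse_something(p: Path) -> Iterator[Union[SomeModel, Exception]]:
    try:
        yield SomeModel(name=p.name)
    except Exception as e:
        yield e
```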
Showing 15 changed files with 183 additions and 148 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/ci.yaml
@@ -2,19 +2,19 @@ name: CI

 on:
   push:
-    branches: [master]
+    branches: ['*']
   pull_request:
-    branches: [master]
+    branches: ['*']

 jobs:
   build:
     strategy:
       matrix:
         platform: [ubuntu-latest, windows-latest]
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.8, 3.9, "3.10", "3.11"]
         exclude: [
           {platform: windows-latest, python-version: "3.8"},
-          {platform: windows-latest, python-version: "3.9"}
+          {platform: windows-latest, python-version: "3.9"},
+          {platform: windows-latest, python-version: "3.10"}
         ]

     runs-on: ${{ matrix.platform }}
3 changes: 1 addition & 2 deletions README.md
@@ -51,7 +51,7 @@ This was extracted out of [my HPI](https://github.com/seanbreckenridge/HPI/tree/

 ## Installation

-Requires `python3.7+`
+Requires `python3.8+`

 To install with pip, run:

@@ -174,7 +174,6 @@ Just to give a brief overview, to add new functionality (parsing some new folder

 - Add a `model` for it in [`models.py`](google_takeout_parser/models.py) subclassing `BaseEvent` and adding it to the Union at the bottom of the file. That should have a `key` property function which describes each event uniquely (used to merge takeout events)
 - Write a function which takes the `Path` to the file you're trying to parse and converts it to the model you created (See examples in [`parse_json.py`](google_takeout_parser/parse_json.py)). Ideally extract a single raw item from the takeout file and add a test for it so it's obvious when/if the format changes.
-- Set [the `return_type`](https://github.com/seanbreckenridge/google_takeout_parser/blob/7b1ee8ec3c3f36e6f279f20a9a214b6a3e8775f5/google_takeout_parser/parse_json.py#L71) property on the function, to use for caching/filtering
 - Add a regex match for the file path to the [`DEFAULT_HANDLER_MAP`](https://github.com/seanbreckenridge/google_takeout_parser/blob/2bd64b7373e4a2ac2ace32e03b25ca3b7e901034/google_takeout_parser/path_dispatch.py#L48)

 ### Testing
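To make the contribution checklist above concrete, here is a hypothetical model-and-handler sketch; `MyEvent` and the `timestampMs` field are invented for illustration and are not part of this commit:

```python
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Tuple, Union


@dataclass
class MyEvent:
    # a model with a `key` property uniquely describing the event,
    # used when merging overlapping takeout exports
    title: str
    dt: datetime

    @property
    def key(self) -> Tuple[str, int]:
        return self.title, int(self.dt.timestamp())


# a handler converting a takeout file into models, yielding exceptions
# instead of raising so one bad item doesn't abort the whole parse
def _parse_my_file(p: Path) -> Iterator[Union[MyEvent, Exception]]:
    for blob in json.loads(p.read_text()):
        try:
            yield MyEvent(
                title=blob["title"],
                dt=datetime.fromtimestamp(int(blob["timestampMs"]) / 1000, tz=timezone.utc),
            )
        except Exception as e:
            yield e
```

The remaining step would map a filename regex to `_parse_my_file` in `DEFAULT_HANDLER_MAP`.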
14 changes: 5 additions & 9 deletions google_takeout_parser/__init__.py
@@ -1,10 +1,6 @@
-from pkg_resources import get_distribution, DistributionNotFound
+import importlib.metadata

-try:
-    # Change here if project is renamed and does not equal the package name
-    dist_name = __name__
-    __version__ = get_distribution(dist_name).version
-except DistributionNotFound:
-    __version__ = "unknown"
-finally:
-    del get_distribution, DistributionNotFound
+# Change here if project is renamed and does not equal the package name
+__version__ = importlib.metadata.version(__name__)
+
+del importlib
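`importlib.metadata` entered the standard library in Python 3.8, which is what dropping 3.7 makes available here. If the old `"unknown"` fallback were still wanted, it could be kept with the new API — a sketch, not what this commit does:

```python
import importlib.metadata

try:
    __version__ = importlib.metadata.version("google_takeout_parser")
except importlib.metadata.PackageNotFoundError:
    # not installed, e.g. running from a plain source checkout
    __version__ = "unknown"
```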
4 changes: 2 additions & 2 deletions google_takeout_parser/__main__.py
@@ -106,9 +106,9 @@ def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None:
     """
     from .path_dispatch import TakeoutParser
     from .merge import cached_merge_takeouts, merge_events
-    from .models import DEFAULT_MODEL_TYPE
+    from .models import DEFAULT_MODEL_TYPE, Res

-    res: List[DEFAULT_MODEL_TYPE] = []
+    res: List[Res[DEFAULT_MODEL_TYPE]] = []
     if cache:
         res = list(cached_merge_takeouts(list(takeout_dir)))
     else:
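The type change records that merged results may include exceptions yielded by handlers, not just events. `Res` is defined in `common.py` and re-exported from `models`; assuming the usual result-or-error convention, its shape is roughly the alias below (a sketch, not the actual source):

```python
from typing import TypeVar, Union

T = TypeVar("T")

# assumed shape: either a successfully parsed value or the exception
# that was caught while parsing it
Res = Union[T, Exception]
```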
8 changes: 0 additions & 8 deletions google_takeout_parser/compat.py

This file was deleted.

4 changes: 2 additions & 2 deletions google_takeout_parser/merge.py
@@ -23,8 +23,8 @@

 # Note: only used for this module, HPI caches elsewhere
 @cachew(
-    cache_path=lambda _: str(takeout_cache_path / "_merged_takeouts"),
-    depends_on=lambda pths: list(sorted([str(p) for p in pths])),
+    cache_path=str(takeout_cache_path / "_merged_takeouts"),
+    depends_on=lambda tp: str(list(sorted(str(p) for p in tp))),
     force_file=True,
     logger=logger,
 )
14 changes: 3 additions & 11 deletions google_takeout_parser/models.py
@@ -7,7 +7,7 @@

 from __future__ import annotations
 from datetime import datetime
-from typing import Optional, List, Tuple, Any, Union, Iterator, TYPE_CHECKING, Dict
+from typing import Optional, List, Tuple, Any, Union, Iterator, Dict, Protocol
 from dataclasses import dataclass

 from .common import Res
@@ -26,14 +26,6 @@

 # name, url
 Subtitles = Tuple[str, MetaData]

-if TYPE_CHECKING:
-    try:
-        from typing import Protocol
-    except ImportError:
-        from typing_extensions import Protocol  # type: ignore
-else:
-    Protocol = object
-

 class BaseEvent(Protocol):
     @property
@@ -107,11 +99,11 @@ def key(self) -> int:
 class Location(BaseEvent):
     lat: float
     lng: float
-    accuracy: Optional[int]
+    accuracy: Optional[float]
     dt: datetime

     @property
-    def key(self) -> Tuple[float, float, Optional[int], int]:
+    def key(self) -> Tuple[float, float, Optional[float], int]:
         return self.lat, self.lng, self.accuracy, int(self.dt.timestamp())
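Each model's `key` is what merging relies on to treat the same event appearing in two overlapping exports as one. Roughly, the idea is the sketch below — an illustration, not the project's actual `merge.py` logic:

```python
from typing import Any, Iterable, Iterator, Set


def deduped(events: Iterable[Any]) -> Iterator[Any]:
    # events with an equal `key` are considered the same takeout event,
    # so only the first occurrence is emitted
    seen: Set[Any] = set()
    for event in events:
        if event.key not in seen:
            seen.add(event.key)
            yield event
```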
3 changes: 0 additions & 3 deletions google_takeout_parser/parse_html/activity.py
@@ -337,6 +337,3 @@ def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
             yield _parse_activity_div(outer_div, file_dt=file_dt)
         except Exception as ae:
             yield ae
-
-
-_parse_html_activity.return_type = Activity  # type: ignore[attr-defined]
3 changes: 0 additions & 3 deletions google_takeout_parser/parse_html/comment.py
@@ -60,9 +60,6 @@ def _parse_html_comment_file(p: Path) -> Iterator[Res[YoutubeComment]]:
         yield e


-_parse_html_comment_file.return_type = YoutubeComment  # type: ignore[attr-defined]
-
-
 def test_parse_html_comment_file() -> None:
     li_obj = bs4.BeautifulSoup(
         """<ul><li>Sent at 2020-04-27 23:18:23 UTC while watching <a href="http://www.youtube.com/watch?v=mM">a video</a>.<br/>content here</li></ul>""",
19 changes: 1 addition & 18 deletions google_takeout_parser/parse_json.py
@@ -70,9 +70,6 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
             yield e


-_parse_json_activity.return_type = Activity  # type: ignore[attr-defined]
-
-
 def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]:
     json_data = json.loads(p.read_text())
     if not isinstance(json_data, list):
@@ -91,9 +88,6 @@ def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]:
             yield e


-_parse_likes.return_type = LikedYoutubeVideo  # type: ignore[attr-defined]
-
-
 def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]:
     json_data = json.loads(p.read_text())
     if not isinstance(json_data, list):
@@ -109,9 +103,6 @@ def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]:
             yield e


-_parse_app_installs.return_type = PlayStoreAppInstall  # type: ignore[attr-defined]
-
-
 def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime:
     if f"{key}Ms" in d:
         return parse_datetime_millis(d[f"{key}Ms"])
@@ -137,14 +128,12 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]:
                 lng=float(loc["longitudeE7"]) / 1e7,
                 lat=float(loc["latitudeE7"]) / 1e7,
                 dt=_parse_location_timestamp(loc),
-                accuracy=None if accuracy is None else int(accuracy),
+                accuracy=None if accuracy is None else float(accuracy),
             )
         except Exception as e:
             yield e


-_parse_location_history.return_type = Location  # type: ignore[attr-defined]
-
 _sem_required_keys = ["location", "duration"]


@@ -209,9 +198,6 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]:
             yield e


-_parse_semantic_location_history.return_type = PlaceVisit  # type: ignore[attr-defined]
-
-
 def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
     json_data = json.loads(p.read_text())
     if "Browser History" not in json_data:
@@ -226,6 +212,3 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
             )
         except Exception as e:
             yield e
-
-
-_parse_chrome_history.return_type = ChromeHistory  # type: ignore[attr-defined]
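A worked example of the E7 conversion seen in `_parse_location_history`: takeout stores coordinates as degrees scaled by 10**7, so decoding is a divide by 1e7 (the sample values below are made up):

```python
loc = {"latitudeE7": 374220041, "longitudeE7": -1220841166}  # made-up sample

lat = float(loc["latitudeE7"]) / 1e7   # 37.4220041
lng = float(loc["longitudeE7"]) / 1e7  # -122.0841166
```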
89 changes: 65 additions & 24 deletions google_takeout_parser/path_dispatch.py
@@ -9,21 +9,21 @@
 from typing import (
     Iterator,
     Dict,
     Union,
     Callable,
     Any,
     Optional,
     List,
     Type,
-    cast,
+    Tuple,
+    Literal,
 )

 from collections import defaultdict

 from cachew import cachew

 from . import __version__ as _google_takeout_version
-from .compat import Literal
 from .common import Res, PathIsh
 from .cache import takeout_cache_path
 from .log import logger
@@ -47,22 +47,68 @@
 HandlerFunction = Callable[[Path], BaseResults]
 HandlerMap = Dict[str, Optional[HandlerFunction]]

-_CacheKeySingle = Type[BaseEvent]
-CacheKey = _CacheKeySingle
+CacheKey = Tuple[Type[BaseEvent], ...]


 def _cache_key_to_str(c: CacheKey) -> str:
-    return str(c.__name__).casefold()
+    """Convert a cache key to a string"""
+    return "_".join(sorted(p.__name__ for p in c)).casefold()


-def _parse_handler_return_type(handler: HandlerFunction) -> CacheKey:
-    assert hasattr(
-        handler, "return_type"
-    ), f"Handler functions should have an 'return_type' property which specifies what types this produces. See parse_json.py for an example. No 'return_type' on {handler}"
-    val: Any = getattr(handler, "return_type")
-    assert isinstance(val, type), f"{val} is not a type"
-    assert BaseEvent in val.__mro__, f"{val} not a subclass of BaseEvent"
-    return cast(_CacheKeySingle, val)
+def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey:
+    # Take a function like Iterator[Union[Item, Exception]] and return Item
+
+    import inspect
+    from cachew.legacy import get_union_args
+
+    sig = inspect.signature(handler)
+
+    # get the return type of the function
+    # e.g. Iterator[Union[Item, Exception]]
+    return_type = sig.return_annotation
+
+    # this must have a return type
+    if return_type == inspect.Signature.empty:
+        raise TypeError(f"Could not get return type for {handler.__name__}")
+
+    # remove top-level iterator if it has it
+    if return_type._name == "Iterator":
+        return_type = return_type.__args__[0]
+
+    args: Optional[Tuple[Type]] = get_union_args(return_type)  # type: ignore[type-arg]
+    if args is None:
+        raise TypeError(
+            f"Could not get union args for {return_type} in {handler.__name__}"
+        )
+
+    # remove exceptions
+    t_args = tuple(t for t in args if t != Exception)
+
+    for t in t_args:
+        if BaseEvent not in t.__mro__:
+            raise TypeError(
+                f"Return type {t} from {return_type} of {handler.__name__} does not contain BaseEvent"
+            )
+        if t == BaseEvent:
+            raise TypeError(
+                f"Return type {t} from {return_type} of {handler.__name__} is BaseEvent, which is not allowed"
+            )
+
+    return tuple(t_args)
+
+
+def _cache_key_to_type(c: CacheKey) -> Any:
+    """
+    If theres one item in the cache key, return that
+    If theres multiple, return a Union of them
+    """
+    assert len(c) > 0
+    if len(c) == 1:
+        return c[0]
+    else:
+        assert isinstance(c, tuple)
+
+        return Union[c]  # type: ignore[valid-type]


 # If parsed, should mention:
@@ -285,7 +331,7 @@ def _log_handler(self, path: Path, handler: Any) -> None:
     def _parse_raw(self, filter_type: Optional[Type[BaseEvent]] = None) -> BaseResults:
         """Parse the takeout with no cache. If a filter is specified, only parses those files"""
         handlers = self._group_by_return_type(filter_type=filter_type)
-        for cache_key, result_tuples in handlers.items():
+        for _, result_tuples in handlers.items():
             for path, itr in result_tuples:
                 self._log_handler(path, itr)
                 yield from itr
@@ -339,9 +385,9 @@ def _group_by_return_type(
         """
         handlers: Dict[CacheKey, List[Tuple[Path, BaseResults]]] = defaultdict(list)
         for path, handler in self.dispatch_map().items():
-            ckey: CacheKey = _parse_handler_return_type(handler)
+            ckey: CacheKey = _handler_type_cache_key(handler)
             # don't include in the result if we're filtering to a specific type
-            if filter_type is not None and ckey != filter_type:
+            if filter_type is not None and filter_type not in ckey:
                 logger.debug(
                     f"Provided '{filter_type}' as filter, '{ckey}' doesn't match, ignoring '{path}'..."
                 )
@@ -381,14 +427,9 @@ def _cached_parse(
     ) -> BaseResults:
         handlers = self._group_by_return_type(filter_type=filter_type)
         for cache_key, result_tuples in handlers.items():
-            # Hmm -- I think this should work with CacheKeys that have multiple
-            # types but it may fail -- need to check if one is added
-            #
-            # create a function which groups the iterators for this return type
-            # that all gets stored in one database
-            #
-            # the return type here is purely for cachew, so it can infer the type
-            def _func() -> Iterator[Res[cache_key]]:  # type: ignore[valid-type]
+            _ret_type: Any = _cache_key_to_type(cache_key)
+
+            def _func() -> Iterator[Res[_ret_type]]:  # type: ignore[valid-type]
                 for path, itr in result_tuples:
                     self._log_handler(path, itr)
                     yield from itr
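The new `_handler_type_cache_key` above uses `inspect.signature` plus `cachew.legacy.get_union_args`; the same extraction can be sketched with only stdlib `typing` helpers. A rough equivalent under that assumption — `Item` and `handler` are hypothetical, and this is not the code from the commit:

```python
import collections.abc
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Iterator, Tuple, Union, get_args, get_origin, get_type_hints


@dataclass
class Item:
    # hypothetical event type standing in for a real model
    name: str


def handler(p: Path) -> Iterator[Union[Item, Exception]]:
    yield Item(name=p.name)


def event_types(func: Callable[..., object]) -> Tuple[type, ...]:
    # resolve the annotation, then unwrap Iterator[...] and Union[...]
    ret = get_type_hints(func)["return"]
    if get_origin(ret) is collections.abc.Iterator:
        ret = get_args(ret)[0]
    args = get_args(ret) or (ret,)  # bare (non-Union) annotations pass through
    return tuple(t for t in args if t is not Exception)


assert event_types(handler) == (Item,)
```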