v0.1.4: bump cachew version, drop py3.7 support
also removes the requirement to have a `.return_type`
purarue authored Sep 30, 2023
2 parents d7c250b + 17b2a23 commit 1584cb8
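The headline change: handlers no longer need a manually attached `.return_type`; the model type a handler produces is now inferred from its return annotation. A minimal sketch of the convention (the `SomeModel` handler below is hypothetical, not from this commit):

```python
from dataclasses import dataclass
from pathlib import Path
from typing import Iterator, Union


@dataclass
class SomeModel:
    # hypothetical model standing in for Activity, Location, etc.
    name: str


# Before: code had to set `_parse_something.return_type = SomeModel`.
# Now the yielded type is read from the return annotation itself.
def _parse_something(p: Path) -> Iterator[Union[SomeModel, Exception]]:
    try:
        yield SomeModel(name=p.name)
    except Exception as e:
        yield e
```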
Showing 15 changed files with 183 additions and 148 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/ci.yaml
@@ -2,19 +2,19 @@ name: CI

 on:
   push:
-    branches: [master]
+    branches: ['*']
   pull_request:
-    branches: [master]
+    branches: ['*']

 jobs:
   build:
     strategy:
       matrix:
         platform: [ubuntu-latest, windows-latest]
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.8, 3.9, "3.10", "3.11"]
         exclude: [
           {platform: windows-latest, python-version: "3.8"},
-          {platform: windows-latest, python-version: "3.9"}
+          {platform: windows-latest, python-version: "3.9"},
+          {platform: windows-latest, python-version: "3.10"}
         ]

     runs-on: ${{ matrix.platform }}
3 changes: 1 addition & 2 deletions README.md
@@ -51,7 +51,7 @@ This was extracted out of [my HPI](https://github.com/seanbreckenridge/HPI/tree/

 ## Installation

-Requires `python3.7+`
+Requires `python3.8+`

 To install with pip, run:

@@ -174,7 +174,6 @@ Just to give a brief overview, to add new functionality (parsing some new folder

 - Add a `model` for it in [`models.py`](google_takeout_parser/models.py) subclassing `BaseEvent` and adding it to the Union at the bottom of the file. That should have a `key` property function which describes each event uniquely (used to merge takeout events)
 - Write a function which takes the `Path` to the file you're trying to parse and converts it to the model you created (See examples in [`parse_json.py`](google_takeout_parser/parse_json.py)). Ideally extract a single raw item from the takeout file and add a test for it so it's obvious when/if the format changes.
-- Set [the `return_type`](https://github.com/seanbreckenridge/google_takeout_parser/blob/7b1ee8ec3c3f36e6f279f20a9a214b6a3e8775f5/google_takeout_parser/parse_json.py#L71) property on the function, to use for caching/filtering
 - Add a regex match for the file path to the [`DEFAULT_HANDLER_MAP`](https://github.com/seanbreckenridge/google_takeout_parser/blob/2bd64b7373e4a2ac2ace32e03b25ca3b7e901034/google_takeout_parser/path_dispatch.py#L48)

 ### Testing
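To make the contribution checklist above concrete, here is a hypothetical model-and-handler sketch; `MyEvent` and the `timestampMs` field are invented for illustration and are not part of this commit:

```python
import json
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Tuple, Union


@dataclass
class MyEvent:
    # a model with a `key` property uniquely describing the event,
    # used when merging overlapping takeout exports
    title: str
    dt: datetime

    @property
    def key(self) -> Tuple[str, int]:
        return self.title, int(self.dt.timestamp())


# a handler converting a takeout file into models, yielding exceptions
# instead of raising so one bad item doesn't abort the whole parse
def _parse_my_file(p: Path) -> Iterator[Union[MyEvent, Exception]]:
    for blob in json.loads(p.read_text()):
        try:
            yield MyEvent(
                title=blob["title"],
                dt=datetime.fromtimestamp(int(blob["timestampMs"]) / 1000, tz=timezone.utc),
            )
        except Exception as e:
            yield e
```

The remaining step would map a filename regex to `_parse_my_file` in `DEFAULT_HANDLER_MAP`.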
14 changes: 5 additions & 9 deletions google_takeout_parser/__init__.py
@@ -1,10 +1,6 @@
-from pkg_resources import get_distribution, DistributionNotFound
+import importlib.metadata

-try:
-    # Change here if project is renamed and does not equal the package name
-    dist_name = __name__
-    __version__ = get_distribution(dist_name).version
-except DistributionNotFound:
-    __version__ = "unknown"
-finally:
-    del get_distribution, DistributionNotFound
+# Change here if project is renamed and does not equal the package name
+__version__ = importlib.metadata.version(__name__)
+
+del importlib
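`importlib.metadata` entered the standard library in Python 3.8, which is what dropping 3.7 makes available here. If the old `"unknown"` fallback were still wanted, it could be kept with the new API — a sketch, not what this commit does:

```python
import importlib.metadata

try:
    __version__ = importlib.metadata.version("google_takeout_parser")
except importlib.metadata.PackageNotFoundError:
    # not installed, e.g. running from a plain source checkout
    __version__ = "unknown"
```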
4 changes: 2 additions & 2 deletions google_takeout_parser/__main__.py
@@ -106,9 +106,9 @@ def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None:
     """
     from .path_dispatch import TakeoutParser
     from .merge import cached_merge_takeouts, merge_events
-    from .models import DEFAULT_MODEL_TYPE
+    from .models import DEFAULT_MODEL_TYPE, Res

-    res: List[DEFAULT_MODEL_TYPE] = []
+    res: List[Res[DEFAULT_MODEL_TYPE]] = []
     if cache:
         res = list(cached_merge_takeouts(list(takeout_dir)))
     else:
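The type change records that merged results may include exceptions yielded by handlers, not just events. `Res` is defined in `common.py` and re-exported from `models`; assuming the usual result-or-error convention, its shape is roughly the alias below (a sketch, not the actual source):

```python
from typing import TypeVar, Union

T = TypeVar("T")

# assumed shape: either a successfully parsed value or the exception
# that was caught while parsing it
Res = Union[T, Exception]
```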
8 changes: 0 additions & 8 deletions google_takeout_parser/compat.py

This file was deleted.

4 changes: 2 additions & 2 deletions google_takeout_parser/merge.py
@@ -23,8 +23,8 @@

 # Note: only used for this module, HPI caches elsewhere
 @cachew(
-    cache_path=lambda _: str(takeout_cache_path / "_merged_takeouts"),
-    depends_on=lambda pths: list(sorted([str(p) for p in pths])),
+    cache_path=str(takeout_cache_path / "_merged_takeouts"),
+    depends_on=lambda tp: str(list(sorted(str(p) for p in tp))),
     force_file=True,
     logger=logger,
 )
14 changes: 3 additions & 11 deletions google_takeout_parser/models.py
@@ -7,7 +7,7 @@

 from __future__ import annotations
 from datetime import datetime
-from typing import Optional, List, Tuple, Any, Union, Iterator, TYPE_CHECKING, Dict
+from typing import Optional, List, Tuple, Any, Union, Iterator, Dict, Protocol
 from dataclasses import dataclass

 from .common import Res
@@ -26,14 +26,6 @@

 # name, url
 Subtitles = Tuple[str, MetaData]

-if TYPE_CHECKING:
-    try:
-        from typing import Protocol
-    except ImportError:
-        from typing_extensions import Protocol  # type: ignore
-else:
-    Protocol = object
-

 class BaseEvent(Protocol):
     @property
@@ -107,11 +99,11 @@ def key(self) -> int:
 class Location(BaseEvent):
     lat: float
     lng: float
-    accuracy: Optional[int]
+    accuracy: Optional[float]
     dt: datetime

     @property
-    def key(self) -> Tuple[float, float, Optional[int], int]:
+    def key(self) -> Tuple[float, float, Optional[float], int]:
         return self.lat, self.lng, self.accuracy, int(self.dt.timestamp())
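Each model's `key` is what merging relies on to treat the same event appearing in two overlapping exports as one. Roughly, the idea is the sketch below — an illustration, not the project's actual `merge.py` logic:

```python
from typing import Any, Iterable, Iterator, Set


def deduped(events: Iterable[Any]) -> Iterator[Any]:
    # events with an equal `key` are considered the same takeout event,
    # so only the first occurrence is emitted
    seen: Set[Any] = set()
    for event in events:
        if event.key not in seen:
            seen.add(event.key)
            yield event
```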
3 changes: 0 additions & 3 deletions google_takeout_parser/parse_html/activity.py
@@ -337,6 +337,3 @@ def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
             yield _parse_activity_div(outer_div, file_dt=file_dt)
         except Exception as ae:
             yield ae
-
-
-_parse_html_activity.return_type = Activity  # type: ignore[attr-defined]
3 changes: 0 additions & 3 deletions google_takeout_parser/parse_html/comment.py
@@ -60,9 +60,6 @@ def _parse_html_comment_file(p: Path) -> Iterator[Res[YoutubeComment]]:
         yield e


-_parse_html_comment_file.return_type = YoutubeComment  # type: ignore[attr-defined]
-
-
 def test_parse_html_comment_file() -> None:
     li_obj = bs4.BeautifulSoup(
         """<ul><li>Sent at 2020-04-27 23:18:23 UTC while watching <a href="http://www.youtube.com/watch?v=mM">a video</a>.<br/>content here</li></ul>""",
19 changes: 1 addition & 18 deletions google_takeout_parser/parse_json.py
@@ -70,9 +70,6 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
             yield e


-_parse_json_activity.return_type = Activity  # type: ignore[attr-defined]
-
-
 def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]:
     json_data = json.loads(p.read_text())
     if not isinstance(json_data, list):
@@ -91,9 +88,6 @@ def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]:
             yield e


-_parse_likes.return_type = LikedYoutubeVideo  # type: ignore[attr-defined]
-
-
 def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]:
     json_data = json.loads(p.read_text())
     if not isinstance(json_data, list):
@@ -109,9 +103,6 @@ def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]:
             yield e


-_parse_app_installs.return_type = PlayStoreAppInstall  # type: ignore[attr-defined]
-
-
 def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime:
     if f"{key}Ms" in d:
         return parse_datetime_millis(d[f"{key}Ms"])
@@ -137,14 +128,12 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]:
                 lng=float(loc["longitudeE7"]) / 1e7,
                 lat=float(loc["latitudeE7"]) / 1e7,
                 dt=_parse_location_timestamp(loc),
-                accuracy=None if accuracy is None else int(accuracy),
+                accuracy=None if accuracy is None else float(accuracy),
             )
         except Exception as e:
             yield e


-_parse_location_history.return_type = Location  # type: ignore[attr-defined]
-
 _sem_required_keys = ["location", "duration"]


@@ -209,9 +198,6 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]:
             yield e


-_parse_semantic_location_history.return_type = PlaceVisit  # type: ignore[attr-defined]
-
-
 def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
     json_data = json.loads(p.read_text())
     if "Browser History" not in json_data:
@@ -226,6 +212,3 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
             )
         except Exception as e:
             yield e
-
-
-_parse_chrome_history.return_type = ChromeHistory  # type: ignore[attr-defined]
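A worked example of the E7 conversion seen in `_parse_location_history`: takeout stores coordinates as degrees scaled by 10**7, so decoding is a divide by 1e7 (the sample values below are made up):

```python
loc = {"latitudeE7": 374220041, "longitudeE7": -1220841166}  # made-up sample

lat = float(loc["latitudeE7"]) / 1e7   # 37.4220041
lng = float(loc["longitudeE7"]) / 1e7  # -122.0841166
```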
89 changes: 65 additions & 24 deletions google_takeout_parser/path_dispatch.py
@@ -9,21 +9,21 @@
 from typing import (
     Iterator,
     Dict,
     Union,
     Callable,
     Any,
     Optional,
     List,
     Type,
-    cast,
+    Tuple,
+    Literal,
 )

 from collections import defaultdict

 from cachew import cachew

 from . import __version__ as _google_takeout_version
-from .compat import Literal
 from .common import Res, PathIsh
 from .cache import takeout_cache_path
 from .log import logger
@@ -47,22 +47,68 @@
 HandlerFunction = Callable[[Path], BaseResults]
 HandlerMap = Dict[str, Optional[HandlerFunction]]

-_CacheKeySingle = Type[BaseEvent]
-CacheKey = _CacheKeySingle
+CacheKey = Tuple[Type[BaseEvent], ...]


 def _cache_key_to_str(c: CacheKey) -> str:
-    return str(c.__name__).casefold()
+    """Convert a cache key to a string"""
+    return "_".join(sorted(p.__name__ for p in c)).casefold()


-def _parse_handler_return_type(handler: HandlerFunction) -> CacheKey:
-    assert hasattr(
-        handler, "return_type"
-    ), f"Handler functions should have an 'return_type' property which specifies what types this produces. See parse_json.py for an example. No 'return_type' on {handler}"
-    val: Any = getattr(handler, "return_type")
-    assert isinstance(val, type), f"{val} is not a type"
-    assert BaseEvent in val.__mro__, f"{val} not a subclass of BaseEvent"
-    return cast(_CacheKeySingle, val)
+def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey:
+    # Take a function like Iterator[Union[Item, Exception]] and return Item
+
+    import inspect
+    from cachew.legacy import get_union_args
+
+    sig = inspect.signature(handler)
+
+    # get the return type of the function
+    # e.g. Iterator[Union[Item, Exception]]
+    return_type = sig.return_annotation
+
+    # this must have a return type
+    if return_type == inspect.Signature.empty:
+        raise TypeError(f"Could not get return type for {handler.__name__}")
+
+    # remove top-level iterator if it has it
+    if return_type._name == "Iterator":
+        return_type = return_type.__args__[0]
+
+    args: Optional[Tuple[Type]] = get_union_args(return_type)  # type: ignore[type-arg]
+    if args is None:
+        raise TypeError(
+            f"Could not get union args for {return_type} in {handler.__name__}"
+        )
+
+    # remove exceptions
+    t_args = tuple(t for t in args if t != Exception)
+
+    for t in t_args:
+        if BaseEvent not in t.__mro__:
+            raise TypeError(
+                f"Return type {t} from {return_type} of {handler.__name__} does not contain BaseEvent"
+            )
+        if t == BaseEvent:
+            raise TypeError(
+                f"Return type {t} from {return_type} of {handler.__name__} is BaseEvent, which is not allowed"
+            )
+
+    return tuple(t_args)
+
+
+def _cache_key_to_type(c: CacheKey) -> Any:
+    """
+    If theres one item in the cache key, return that
+    If theres multiple, return a Union of them
+    """
+    assert len(c) > 0
+    if len(c) == 1:
+        return c[0]
+    else:
+        assert isinstance(c, tuple)
+
+        return Union[c]  # type: ignore[valid-type]


 # If parsed, should mention:
@@ -285,7 +331,7 @@ def _log_handler(self, path: Path, handler: Any) -> None:
     def _parse_raw(self, filter_type: Optional[Type[BaseEvent]] = None) -> BaseResults:
         """Parse the takeout with no cache. If a filter is specified, only parses those files"""
         handlers = self._group_by_return_type(filter_type=filter_type)
-        for cache_key, result_tuples in handlers.items():
+        for _, result_tuples in handlers.items():
             for path, itr in result_tuples:
                 self._log_handler(path, itr)
                 yield from itr
@@ -339,9 +385,9 @@ def _group_by_return_type(
         """
         handlers: Dict[CacheKey, List[Tuple[Path, BaseResults]]] = defaultdict(list)
         for path, handler in self.dispatch_map().items():
-            ckey: CacheKey = _parse_handler_return_type(handler)
+            ckey: CacheKey = _handler_type_cache_key(handler)
             # don't include in the result if we're filtering to a specific type
-            if filter_type is not None and ckey != filter_type:
+            if filter_type is not None and filter_type not in ckey:
                 logger.debug(
                     f"Provided '{filter_type}' as filter, '{ckey}' doesn't match, ignoring '{path}'..."
                 )
@@ -381,14 +427,9 @@ def _cached_parse(
     ) -> BaseResults:
         handlers = self._group_by_return_type(filter_type=filter_type)
         for cache_key, result_tuples in handlers.items():
-            # Hmm -- I think this should work with CacheKeys that have multiple
-            # types but it may fail -- need to check if one is added
-            #
-            # create a function which groups the iterators for this return type
-            # that all gets stored in one database
-            #
-            # the return type here is purely for cachew, so it can infer the type
-            def _func() -> Iterator[Res[cache_key]]:  # type: ignore[valid-type]
+            _ret_type: Any = _cache_key_to_type(cache_key)
+
+            def _func() -> Iterator[Res[_ret_type]]:  # type: ignore[valid-type]
                 for path, itr in result_tuples:
                     self._log_handler(path, itr)
                     yield from itr
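The new `_handler_type_cache_key` above uses `inspect.signature` plus `cachew.legacy.get_union_args`; the same extraction can be sketched with only stdlib `typing` helpers. A rough equivalent under that assumption — `Item` and `handler` are hypothetical, and this is not the code from the commit:

```python
import collections.abc
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Iterator, Tuple, Union, get_args, get_origin, get_type_hints


@dataclass
class Item:
    # hypothetical event type standing in for a real model
    name: str


def handler(p: Path) -> Iterator[Union[Item, Exception]]:
    yield Item(name=p.name)


def event_types(func: Callable[..., object]) -> Tuple[type, ...]:
    # resolve the annotation, then unwrap Iterator[...] and Union[...]
    ret = get_type_hints(func)["return"]
    if get_origin(ret) is collections.abc.Iterator:
        ret = get_args(ret)[0]
    args = get_args(ret) or (ret,)  # bare (non-Union) annotations pass through
    return tuple(t for t in args if t is not Exception)


assert event_types(handler) == (Item,)
```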