From 42a55140532a732dc34abff940a206179baba71f Mon Sep 17 00:00:00 2001 From: Thibault Derousseaux <6574550+tibdex@users.noreply.github.com> Date: Tue, 4 Mar 2025 15:15:08 -0500 Subject: [PATCH] Add skeleton (#292) --- .github/workflows/test.yml | 1 + Dockerfile | 5 +- app/__init__.py | 2 +- app/constants.py | 55 ------ app/create_and_join_tables.py | 54 +++--- app/create_cubes.py | 79 ++++----- app/load_tables.py | 36 ++-- app/skeleton.py | 128 ++++++++++++++ app/util/__init__.py | 1 + app/util/skeleton/__init__.py | 20 +++ app/util/skeleton/_node.py | 193 ++++++++++++++++++++++ app/util/skeleton/column.py | 9 + app/util/skeleton/contributors_count.py | 1 + app/util/skeleton/fact_based_hierarchy.py | 24 +++ app/util/skeleton/skeleton.py | 132 +++++++++++++++ pyproject.toml | 6 +- tests/docker/test_docker.py | 9 +- tests/test_session.py | 26 +-- uv.lock | 2 + 19 files changed, 618 insertions(+), 165 deletions(-) delete mode 100644 app/constants.py create mode 100644 app/skeleton.py create mode 100644 app/util/skeleton/__init__.py create mode 100644 app/util/skeleton/_node.py create mode 100644 app/util/skeleton/column.py create mode 100644 app/util/skeleton/contributors_count.py create mode 100644 app/util/skeleton/fact_based_hierarchy.py create mode 100644 app/util/skeleton/skeleton.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6e28673..84ae15b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,6 +12,7 @@ jobs: - uses: astral-sh/setup-uv@v3 with: enable-cache: true + # Keep in sync with `Dockerfile`'s `builder`. version: "0.5.6" - run: uv python install 3.10 - run: uv sync --locked diff --git a/Dockerfile b/Dockerfile index 1a35640..5470b54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ # Inspired from https://github.com/astral-sh/uv-docker-example/blob/dee88a8c43be3b16b0ad58f0daee5eaee7e2157a/multistage.Dockerfile. -FROM ghcr.io/astral-sh/uv:0.4.10-python3.10-bookworm-slim AS builder +# Keep in sync with `.github/workflows/test.yml`. +FROM ghcr.io/astral-sh/uv:0.5.6-python3.10-bookworm-slim AS builder ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy @@ -25,4 +26,4 @@ ENV PORT=80 EXPOSE $PORT -CMD ["python", "-u", "-m", "app"] +CMD ["python", "-O", "-u", "-m", "app"] diff --git a/app/__init__.py b/app/__init__.py index 362b5f3..13c9190 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,3 +1,3 @@ from .config import Config as Config -from .constants import * # noqa: F403 +from .skeleton import SKELETON as SKELETON from .start_app import start_app as start_app diff --git a/app/constants.py b/app/constants.py deleted file mode 100644 index 8f6278b..0000000 --- a/app/constants.py +++ /dev/null @@ -1,55 +0,0 @@ -from enum import Enum - - -class Table(Enum): - STATION_DETAILS = "Station details" - STATION_STATUS = "Station status" - - -class StationDetailsTableColumn(Enum): - ID = "ID" - NAME = "Name" - DEPARTMENT = "Department" - CITY = "City" - POSTCODE = "Postcode" - STREET = "Street" - HOUSE_NUMBER = "House number" - CAPACITY = "Capacity" - - -class StationStatusTableColumn(Enum): - STATION_ID = "Station ID" - BIKE_TYPE = "Bike type" - BIKES = "Bikes" - - -class Cube(Enum): - STATION = "Station" - - -class StationCubeHierarchy(Enum): - BIKE_TYPE = StationStatusTableColumn.BIKE_TYPE.value - LOCATION = "Location" - STATION = "Station" - - -class StationCubeBikeTypeLevel(Enum): - BIKE_TYPE = StationCubeHierarchy.BIKE_TYPE.value - - -class StationCubeLocationLevel(Enum): - DEPARTMENT = StationDetailsTableColumn.DEPARTMENT.value - CITY = StationDetailsTableColumn.CITY.value - POSTCODE = StationDetailsTableColumn.POSTCODE.value - STREET = StationDetailsTableColumn.STREET.value - HOUSE_NUMBER = StationDetailsTableColumn.HOUSE_NUMBER.value - - -class StationCubeStationLevel(Enum): - NAME = StationDetailsTableColumn.NAME.value - ID = StationDetailsTableColumn.ID.value - - -class StationCubeMeasure(Enum): - CAPACITY = StationDetailsTableColumn.CAPACITY.value - BIKES = StationStatusTableColumn.BIKES.value diff --git a/app/create_and_join_tables.py b/app/create_and_join_tables.py index 2d8746d..6fed3b2 100644 --- a/app/create_and_join_tables.py +++ b/app/create_and_join_tables.py @@ -1,52 +1,54 @@ import atoti as tt -from .constants import StationDetailsTableColumn, StationStatusTableColumn, Table +from .skeleton import SKELETON +from .util.skeleton import column def create_station_status_table(session: tt.Session, /) -> None: + skeleton = SKELETON.tables.STATION_STATUS + columns = skeleton.columns session.create_table( - Table.STATION_STATUS.value, + skeleton.name, data_types={ - StationStatusTableColumn.STATION_ID.value: tt.LONG, - StationStatusTableColumn.BIKE_TYPE.value: tt.STRING, - StationStatusTableColumn.BIKES.value: tt.INT, + columns.STATION_ID.name: tt.LONG, + columns.BIKE_TYPE.name: tt.STRING, + columns.BIKES.name: tt.INT, }, keys={ - StationStatusTableColumn.STATION_ID.value, - StationStatusTableColumn.BIKE_TYPE.value, + columns.STATION_ID.name, + columns.BIKE_TYPE.name, }, ) def create_station_details_table(session: tt.Session, /) -> None: + skeleton = SKELETON.tables.STATION_DETAILS + columns = skeleton.columns session.create_table( - Table.STATION_DETAILS.value, + skeleton.name, data_types={ - StationDetailsTableColumn.ID.value: tt.LONG, - StationDetailsTableColumn.NAME.value: tt.STRING, - StationDetailsTableColumn.DEPARTMENT.value: tt.STRING, - StationDetailsTableColumn.CITY.value: tt.STRING, - StationDetailsTableColumn.POSTCODE.value: tt.INT, - StationDetailsTableColumn.STREET.value: tt.STRING, - StationDetailsTableColumn.HOUSE_NUMBER.value: tt.STRING, - StationDetailsTableColumn.CAPACITY.value: tt.INT, + columns.ID.name: tt.LONG, + columns.NAME.name: tt.STRING, + columns.DEPARTMENT.name: tt.STRING, + columns.CITY.name: tt.STRING, + columns.POSTCODE.name: tt.INT, + columns.STREET.name: tt.STRING, + columns.HOUSE_NUMBER.name: tt.STRING, + columns.CAPACITY.name: tt.INT, }, - default_values={StationDetailsTableColumn.POSTCODE.value: 0}, + default_values={columns.POSTCODE.name: 0}, keys={ - StationDetailsTableColumn.ID.value, + columns.ID.name, }, ) def join_tables(session: tt.Session, /) -> None: - session.tables[Table.STATION_STATUS.value].join( - session.tables[Table.STATION_DETAILS.value], - session.tables[Table.STATION_STATUS.value][ - StationStatusTableColumn.STATION_ID.value - ] - == session.tables[Table.STATION_DETAILS.value][ - StationDetailsTableColumn.ID.value - ], + tables = SKELETON.tables + session.tables[tables.STATION_STATUS.key].join( + session.tables[tables.STATION_DETAILS.key], + column(session, tables.STATION_STATUS.columns.STATION_ID) + == column(session, tables.STATION_DETAILS.columns.ID), ) diff --git a/app/create_cubes.py b/app/create_cubes.py index c16650e..e041d2f 100644 --- a/app/create_cubes.py +++ b/app/create_cubes.py @@ -1,69 +1,48 @@ import atoti as tt -from .constants import ( - Cube, - StationCubeBikeTypeLevel, - StationCubeHierarchy, - StationCubeLocationLevel, - StationCubeMeasure, - StationCubeStationLevel, - StationDetailsTableColumn, - StationStatusTableColumn, - Table, -) +from .skeleton import SKELETON +from .util.skeleton import column, fact_based_hierarchy def create_station_cube(session: tt.Session, /) -> None: - station_details_table = session.tables[Table.STATION_DETAILS.value] - station_status_table = session.tables[Table.STATION_STATUS.value] + tables = SKELETON.tables + skeleton = SKELETON.cubes.STATION - cube = session.create_cube(station_status_table, Cube.STATION.value, mode="manual") + cube = session.create_cube( + session.tables[tables.STATION_STATUS.key], + skeleton.name, + mode="manual", + ) h, l, m = cube.hierarchies, cube.levels, cube.measures h.update( - { - StationCubeHierarchy.BIKE_TYPE.value: { - StationCubeBikeTypeLevel.BIKE_TYPE.value: station_status_table[ - StationStatusTableColumn.BIKE_TYPE.value + dict( + [ + fact_based_hierarchy(session, hierarchy) + for hierarchy in [ + skeleton.dimensions.STATION_STATUS.hierarchies.BIKE_TYPE, + skeleton.dimensions.STATION_DETAILS.hierarchies.LOCATION, + skeleton.dimensions.STATION_DETAILS.hierarchies.STATION, ] - }, - StationCubeHierarchy.LOCATION.value: { - StationCubeLocationLevel.DEPARTMENT.value: station_details_table[ - StationDetailsTableColumn.DEPARTMENT.value - ], - StationCubeLocationLevel.CITY.value: station_details_table[ - StationDetailsTableColumn.CITY.value - ], - StationCubeLocationLevel.POSTCODE.value: station_details_table[ - StationDetailsTableColumn.POSTCODE.value - ], - StationCubeLocationLevel.STREET.value: station_details_table[ - StationDetailsTableColumn.STREET.value - ], - StationCubeLocationLevel.HOUSE_NUMBER.value: station_details_table[ - StationDetailsTableColumn.HOUSE_NUMBER.value - ], - }, - StationCubeHierarchy.STATION.value: { - StationCubeStationLevel.NAME.value: station_details_table[ - StationDetailsTableColumn.NAME.value - ], - StationCubeStationLevel.ID.value: station_status_table[ - StationStatusTableColumn.STATION_ID.value - ], - }, - } + ] + ) ) with session.data_model_transaction(): - m[StationCubeMeasure.BIKES.value] = tt.agg.sum( - station_status_table[StationStatusTableColumn.BIKES.value] + m[skeleton.measures.BIKES.key] = tt.agg.sum( + column(session, tables.STATION_STATUS.columns.BIKES) ) - m[StationCubeMeasure.CAPACITY.value] = tt.agg.sum( + m[skeleton.measures.CAPACITY.key] = tt.agg.sum( tt.agg.single_value( - station_details_table[StationDetailsTableColumn.CAPACITY.value] + column(session, tables.STATION_DETAILS.columns.CAPACITY) + ), + scope=tt.OriginScope( + { + l[ + skeleton.dimensions.STATION_DETAILS.hierarchies.STATION.levels.ID.key + ] + } ), - scope=tt.OriginScope({l[StationCubeStationLevel.ID.value]}), ) diff --git a/app/load_tables.py b/app/load_tables.py index 0bc3793..c1db43b 100644 --- a/app/load_tables.py +++ b/app/load_tables.py @@ -9,7 +9,7 @@ from pydantic import HttpUrl from .config import Config -from .constants import StationDetailsTableColumn, StationStatusTableColumn, Table +from .skeleton import SKELETON from .util import read_json, reverse_geocode @@ -19,6 +19,7 @@ async def read_station_details( reverse_geocoding_path: HttpUrl | Path, velib_data_base_path: HttpUrl | Path, ) -> pd.DataFrame: + columns = SKELETON.tables.STATION_DETAILS.columns stations_data: Any = cast( Any, await read_json( @@ -31,9 +32,9 @@ async def read_station_details( ["station_id", "name", "capacity", "lat", "lon"] ].rename( columns={ - "station_id": StationDetailsTableColumn.ID.value, - "name": StationDetailsTableColumn.NAME.value, - "capacity": StationDetailsTableColumn.CAPACITY.value, + "station_id": columns.ID.name, + "name": columns.NAME.name, + "capacity": columns.CAPACITY.name, "lat": "latitude", "lon": "longitude", } @@ -52,11 +53,11 @@ async def read_station_details( coordinates, reverse_geocoding_path=reverse_geocoding_path ).rename( columns={ - "department": StationDetailsTableColumn.DEPARTMENT.value, - "city": StationDetailsTableColumn.CITY.value, - "postcode": StationDetailsTableColumn.POSTCODE.value, - "street": StationDetailsTableColumn.STREET.value, - "house_number": StationDetailsTableColumn.HOUSE_NUMBER.value, + "department": columns.DEPARTMENT.name, + "city": columns.CITY.name, + "postcode": columns.POSTCODE.name, + "street": columns.STREET.name, + "house_number": columns.HOUSE_NUMBER.name, } ) @@ -71,6 +72,7 @@ async def read_station_status( *, http_client: httpx.AsyncClient, ) -> pd.DataFrame: + columns = SKELETON.tables.STATION_STATUS.columns stations_data = cast( Any, await read_json( @@ -89,11 +91,9 @@ async def read_station_status( bike_type, bikes = next(iter(num_bikes_available_types.items())) station_statuses.append( { - StationStatusTableColumn.STATION_ID.value: station_status[ - "station_id" - ], - StationStatusTableColumn.BIKE_TYPE.value: bike_type, - StationStatusTableColumn.BIKES.value: bikes, + columns.STATION_ID.name: station_status["station_id"], + columns.BIKE_TYPE.name: bike_type, + columns.BIKES.name: bikes, } ) return pd.DataFrame(station_statuses) @@ -120,6 +120,10 @@ async def load_tables( with session.tables.data_transaction(): await asyncio.gather( - session.tables[Table.STATION_DETAILS.value].load_async(station_details_df), - session.tables[Table.STATION_STATUS.value].load_async(station_status_df), + session.tables[SKELETON.tables.STATION_DETAILS.key].load_async( + station_details_df + ), + session.tables[SKELETON.tables.STATION_STATUS.key].load_async( + station_status_df + ), ) diff --git a/app/skeleton.py b/app/skeleton.py new file mode 100644 index 0000000..06f9143 --- /dev/null +++ b/app/skeleton.py @@ -0,0 +1,128 @@ +from .util.skeleton import ( + Column, + Columns, + Cube, + Cubes, + Dimension, + Dimensions, + Hierarchies, + Hierarchy, + Level, + Levels, + Measure, + Measures, + Skeleton, + Table, + Tables, +) + + +class _StationDetailsTableColumns(Columns): + ID = Column("ID") + NAME = Column("Name") + DEPARTMENT = Column("Department") + CITY = Column("City") + POSTCODE = Column("Postcode") + STREET = Column("Street") + HOUSE_NUMBER = Column("House number") + CAPACITY = Column("Capacity") + + +class _StationDetailsTable(Table): + name = "Station details" + columns = _StationDetailsTableColumns() + + +class _StationStatusTableColumns(Columns): + STATION_ID = Column("Station ID") + BIKE_TYPE = Column("Bike type") + BIKES = Column("Bikes") + + +class _StationStatusTable(Table): + name = "Station status" + columns = _StationStatusTableColumns() + + +class _Tables(Tables): + STATION_DETAILS = _StationDetailsTable() + STATION_STATUS = _StationStatusTable() + + +class _StationCubeStationDetailsDimensionLocationHierarchyLevels(Levels): + DEPARTMENT = Level(_StationDetailsTableColumns.DEPARTMENT) + CITY = Level(_StationDetailsTableColumns.CITY) + POSTCODE = Level(_StationDetailsTableColumns.POSTCODE) + STREET = Level(_StationDetailsTableColumns.STREET) + HOUSE_NUMBER = Level(_StationDetailsTableColumns.HOUSE_NUMBER) + + +class _StationCubeStationDetailsDimensionLocationHierarchy(Hierarchy): + name = "Location" + levels = _StationCubeStationDetailsDimensionLocationHierarchyLevels() + + +class _StationCubeStationDetailsDimensionStationHierarchyLevels(Levels): + NAME = Level(_StationDetailsTableColumns.NAME) + ID = Level(_StationDetailsTableColumns.ID) + + +class _StationCubeStationDetailsDimensionStationHierarchy(Hierarchy): + name = "Station" + levels = _StationCubeStationDetailsDimensionStationHierarchyLevels() + + +class _StationCubeStationDetailsDimensionHierarchies(Hierarchies): + LOCATION = _StationCubeStationDetailsDimensionLocationHierarchy() + STATION = _StationCubeStationDetailsDimensionStationHierarchy() + + +class _StationCubeStationDetailsDimension(Dimension): + name = _StationDetailsTable.name + hierarchies = _StationCubeStationDetailsDimensionHierarchies() + + +class _StationCubeStationStatusDimensionBikeTypeHierarchyLevels(Levels): + BIKE_TYPE = Level(_StationStatusTableColumns.BIKE_TYPE) + + +class _StationCubeStationStatusDimensionBikeTypeHierarchy(Hierarchy): + name = _StationStatusTableColumns.BIKE_TYPE.name + levels = _StationCubeStationStatusDimensionBikeTypeHierarchyLevels() + + +class _StationCubeStationStatusDimensionHierarchies(Hierarchies): + BIKE_TYPE = _StationCubeStationStatusDimensionBikeTypeHierarchy() + + +class _StationCubeStationStatusDimension(Dimension): + name = _StationStatusTable.name + hierarchies = _StationCubeStationStatusDimensionHierarchies() + + +class _StationCubeDimensions(Dimensions): + STATION_DETAILS = _StationCubeStationDetailsDimension() + STATION_STATUS = _StationCubeStationStatusDimension() + + +class _StationCubeMeasures(Measures): + CAPACITY = Measure(_StationDetailsTableColumns.CAPACITY.name) + BIKES = Measure(_StationStatusTableColumns.BIKES.name) + + +class _StationCube(Cube): + name = "Station" + dimensions = _StationCubeDimensions() + measures = _StationCubeMeasures() + + +class _Cubes(Cubes): + STATION = _StationCube() + + +class _Skeleton(Skeleton): + cubes = _Cubes() + tables = _Tables() + + +SKELETON = _Skeleton() diff --git a/app/util/__init__.py b/app/util/__init__.py index 5bee464..4f592a3 100644 --- a/app/util/__init__.py +++ b/app/util/__init__.py @@ -1,3 +1,4 @@ +from . import skeleton as skeleton from .normalize_postgres_dsn_for_atoti_jdbc import ( normalize_postgres_dsn_for_atoti_jdbc as normalize_postgres_dsn_for_atoti_jdbc, ) diff --git a/app/util/skeleton/__init__.py b/app/util/skeleton/__init__.py new file mode 100644 index 0000000..36d1dfd --- /dev/null +++ b/app/util/skeleton/__init__.py @@ -0,0 +1,20 @@ +from .column import column as column +from .contributors_count import CONTRIBUTORS_COUNT as CONTRIBUTORS_COUNT +from .fact_based_hierarchy import fact_based_hierarchy as fact_based_hierarchy +from .skeleton import ( + Column as Column, + Columns as Columns, + Cube as Cube, + Cubes as Cubes, + Dimension as Dimension, + Dimensions as Dimensions, + Hierarchies as Hierarchies, + Hierarchy as Hierarchy, + Level as Level, + Levels as Levels, + Measure as Measure, + Measures as Measures, + Skeleton as Skeleton, + Table as Table, + Tables as Tables, +) diff --git a/app/util/skeleton/_node.py b/app/util/skeleton/_node.py new file mode 100644 index 0000000..d2bc974 --- /dev/null +++ b/app/util/skeleton/_node.py @@ -0,0 +1,193 @@ +"""Module containing the node classes required to build a skeleton. + +The code is dense and makes heavy use of reflection but expected invariants are enforced by many assertions. + +This module does not need to be modified to add new nodes to the skeleton. +""" + +from abc import ABC +from typing import ( + ClassVar, + Final, + Generic, + TypeVar, + cast, + final, + get_args, + get_origin, +) + +from typing_extensions import TypeVarTuple, Unpack, get_original_bases, override + + +def _camel_from_pascal(name: str, /) -> str: + match len(name): + case 0: + return "" + case 1: + return name.lower() + case _: + return f"{name[0].lower()}{name[1:]}" + + +def _is_private(name: str, /) -> bool: + return name.startswith("_") + + +_KeyT_co = TypeVar("_KeyT_co", bound=str | tuple[str, ...], covariant=True) + +_ChildT = TypeVar("_ChildT") +_HeterogeneousChildT = TypeVarTuple("_HeterogeneousChildT") + +_LEAF_NODE_CLASS_NAME = "LeafNode" + + +class HeterogeneousNode(Generic[_KeyT_co, Unpack[_HeterogeneousChildT]], ABC): + _child_types: tuple[Unpack[_HeterogeneousChildT]] + _key_length: ClassVar[int] + _path: tuple[str, ...] | None = None + name: str + + @override + def __init_subclass__(cls) -> None: + super().__init_subclass__() + + # Cannot directly reference `LeafNode` at this time. + if cls.__name__ == _LEAF_NODE_CLASS_NAME: + return + + (orig_base,) = cls.__orig_bases__ # type: ignore[attr-defined] + key_type, *child_types = get_args(orig_base) + + if get_origin(orig_base).__name__ == _LEAF_NODE_CLASS_NAME: + assert not child_types, ( + f"Expected leaf {cls.__name__} to not have children but got {child_types}." + ) + cls._child_types = () + else: + assert child_types, ( + f"Expected non-leaf {cls.__name__} to have children but got none." + ) + cls._child_types = tuple(child_types) + + if HeterogeneousNode not in cls.__bases__: + for child_type in cls._child_types: + child = cls._child(child_type) + assert isinstance(child, HomogeneousNode | None), ( + f"Expected {cls.__name__}'s {child_type.__name__} to be an {HomogeneousNode.__name__} but got {type(child).__name__}." + ) + + attribute_names = { + name + for name in dir(cls) + if not _is_private(name) and name not in {"key", "name"} + } + if ( + _LEAF_NODE_CLASS_NAME in {base.__name__ for base in cls.__bases__} + ) or HeterogeneousNode in cls.__bases__: + assert not attribute_names, ( + f"Expected {cls.__name__} to have no attributes but got {attribute_names}." + ) + else: + unexpected_attribute_names = attribute_names - { + _camel_from_pascal(child_type.__name__) + for child_type in cls._child_types + } + assert not unexpected_attribute_names, ( + f"{cls.__name__} has some unexpected attributes: {attribute_names}." + ) + + if key_type is str: + cls._key_length = 1 + else: + assert get_origin(key_type) is tuple + key_length = len(get_args(key_type)) + degenerated_tuple_length = 1 + assert key_length == 0 or key_length > degenerated_tuple_length, ( + "Use `str` instead of `tuple[str]`." + ) + cls._key_length = key_length + + @final + @classmethod + def _child(cls, child_type: type[_ChildT], /) -> _ChildT | None: + attribute_name = _camel_from_pascal(child_type.__name__) + child = getattr(cls, attribute_name, None) + assert isinstance(child, child_type | None), ( + f"Expected {cls.__name__}.{attribute_name} to be a {child_type.__name__} but got {type(child).__name__}." + ) + return child + + @final + @property + def key(self) -> _KeyT_co: + assert self._path is not None, ( + f"The `_path` of the {type(self).__name__} named `{self.name}` should have been set by now." + ) + match self._key_length: + case 0: + return cast(_KeyT_co, ()) + case 1: + return cast(_KeyT_co, self._path[-1]) + case key_length: + return cast(_KeyT_co, self._path[-key_length:]) + + @final + def _set_path(self, *, parent_path: tuple[str, ...]) -> None: + self_part = () if self.name is None else (self.name,) + self._path = (*parent_path, *self_part) + for child_type in self._child_types: + assert isinstance(child_type, type) + assert issubclass(child_type, HomogeneousNode) + child = self._child(child_type) + if child is not None: + child._set_path(parent_path=self._path) # noqa: SLF001 + + +class LeafNode(HeterogeneousNode[_KeyT_co]): + def __init__(self, name: str, /) -> None: + super().__init__() + self.name: Final = name + + +assert LeafNode.__name__ == _LEAF_NODE_CLASS_NAME + +_HomogenousChildT = TypeVar("_HomogenousChildT") + + +class HomogeneousNode(Generic[_HomogenousChildT], ABC): + _child_type: type[_HomogenousChildT] + + @override + def __init_subclass__(cls) -> None: + super().__init_subclass__() + + if HomogeneousNode not in cls.__bases__: + return + + (original_base,) = get_original_bases(cls) + assert get_origin(original_base) is HomogeneousNode + (child_type,) = get_args(original_base) + assert issubclass(child_type, HeterogeneousNode) + cls._child_type = child_type + + @final + @classmethod + def _children(cls) -> dict[str, _HomogenousChildT]: + children: dict[str, _HomogenousChildT] = {} + for name, value in vars(cls).items(): + if _is_private(name): + continue + assert isinstance(value, cls._child_type), ( + f"Expected {cls.__name__}.{name} to be a {cls._child_type.__name__} but got {type(value).__name__}." + ) + children[name] = value + return children + + @final + def _set_path(self, *, parent_path: tuple[str, ...]) -> None: + for value in self._children().values(): + assert isinstance(value, HeterogeneousNode), ( + f"Expected {type(value).__name__} to be an {HeterogeneousNode.__name__}." + ) + value._set_path(parent_path=parent_path) # noqa: SLF001 diff --git a/app/util/skeleton/column.py b/app/util/skeleton/column.py new file mode 100644 index 0000000..720aa51 --- /dev/null +++ b/app/util/skeleton/column.py @@ -0,0 +1,9 @@ +import atoti as tt + +from .skeleton import Column as Column + + +def column(session: tt.Session, column: Column, /) -> tt.Column: + """Atoti has :attr:`atoti.Cube.levels` but no `Tables.columns`, this is the next-best thing.""" + table_name, column_name = column.key + return session.tables[table_name][column_name] diff --git a/app/util/skeleton/contributors_count.py b/app/util/skeleton/contributors_count.py new file mode 100644 index 0000000..5adfdec --- /dev/null +++ b/app/util/skeleton/contributors_count.py @@ -0,0 +1 @@ +CONTRIBUTORS_COUNT = "contributors.COUNT" diff --git a/app/util/skeleton/fact_based_hierarchy.py b/app/util/skeleton/fact_based_hierarchy.py new file mode 100644 index 0000000..d0cf4a1 --- /dev/null +++ b/app/util/skeleton/fact_based_hierarchy.py @@ -0,0 +1,24 @@ +import atoti as tt + +from .column import column +from .skeleton import Column, Hierarchy, Level, Levels + + +def _column(level: Level, /) -> Column: + column = level._column # noqa: SLF001 + assert column is not None, ( + f"Cannot use `{fact_based_hierarchy.__name__}()` with a hierarchy with level `{level.name}` not based on a column." + ) + return column + + +def fact_based_hierarchy( + session: tt.Session, hierarchy: Hierarchy, / +) -> tuple[tuple[str, str], dict[str, tt.Column]]: + """Return the definition of a hierarchy for which all levels are based on columns.""" + levels = hierarchy._child(Levels) # noqa: SLF001 + assert levels is not None + return hierarchy.key, { + level.name: column(session, _column(level)) + for level in levels._children().values() # noqa: SLF001 + } diff --git a/app/util/skeleton/skeleton.py b/app/util/skeleton/skeleton.py new file mode 100644 index 0000000..f1ef2bd --- /dev/null +++ b/app/util/skeleton/skeleton.py @@ -0,0 +1,132 @@ +from typing import Final, final + +from typing_extensions import override + +from ._node import HeterogeneousNode, HomogeneousNode, LeafNode + + +@final +class Column(LeafNode[tuple[str, str]]): ... + + +class Columns(HomogeneousNode[Column]): ... + + +class Table(HeterogeneousNode[str, Columns]): ... + + +class Tables(HomogeneousNode[Table]): ... + + +@final +class Level(LeafNode[tuple[str, str, str]]): + _column: Column | None = None + + def __init__(self, column_or_name: Column | str, /) -> None: + match column_or_name: + case Column() as column: + super().__init__("__pending__") + self._column = column + case str() as name: + super().__init__(name) + + @override # type: ignore[misc] + def _set_path(self, *, parent_path: tuple[str, ...]) -> None: + if self._column is not None: + assert self._column.key is not None, ( + "The column key should have been set by now." + ) + self.name = self._column.name # type: ignore[misc] + + super()._set_path(parent_path=parent_path) + + +class Levels(HomogeneousNode[Level]): ... + + +class Hierarchy(HeterogeneousNode[tuple[str, str], Levels]): ... + + +class Hierarchies(HomogeneousNode[Hierarchy]): ... + + +class Dimension(HeterogeneousNode[str, Hierarchies]): ... + + +class Dimensions(HomogeneousNode[Dimension]): ... + + +@final +class Measure(LeafNode[str]): ... + + +class Measures(HomogeneousNode[Measure]): ... + + +class Cube(HeterogeneousNode[str, Dimensions, Measures]): ... + + +class Cubes(HomogeneousNode[Cube]): ... + + +class Skeleton( + HeterogeneousNode[ + tuple[()], + # Before `Cubes` so that a `Level` referencing a `Column` can access the column's `_path`. + Tables, + Cubes, + ] +): + """The skeleton of a data model. + + It mirrors the structure of the data model but only declares the parent/child relationship between nodes and the name of each node. + + Note: + Attaching other information to the skeleton is discouraged because this will end up duplicating the data model API already provided by Atoti. + For instance, it is discouraged to add a ``data_type`` attribute to ``Column``, or a ``keys`` attribute to ``Table``. + + Skeletons scale well to large data models because IDEs can inspect them statically and thus offer: + + * Autocompletion + * "Find all references" + * "Go to definition" + * Type checking + * Dead code detection + + When instantiated, the skeleton will propagate the path from the root (i.e. this class) to all the nodes, providing easy access to unambiguous keys: + + >>> class _MyCubeFooDimensionBarHierarchyLevels(Levels): + ... BAZ = Level("baz") + >>> class _MyCubeFooDimensionBarHierarchy(Hierarchy): + ... name = "bar" + ... levels = _MyCubeFooDimensionBarHierarchyLevels() + >>> class _MyCubeFooDimensionHierarchies(Hierarchies): + ... BAR = _MyCubeFooDimensionBarHierarchy() + >>> class _MyCubeFooDimension(Dimension): + ... name = "foo" + ... hierarchies = _MyCubeFooDimensionHierarchies() + >>> class _MyCubeDimensions(Dimensions): + ... FOO = _MyCubeFooDimension() + >>> class _MyCube(Cube): + ... name = "my cube" + ... dimensions = _MyCubeDimensions() + >>> class _Cubes(Cubes): + ... MY_CUBE = _MyCube() + >>> class _Skeleton(Skeleton): + ... cubes = _Cubes() + >>> SKELETON = _Skeleton() + >>> SKELETON.cubes.MY_CUBE.dimensions.FOO.key + 'foo' + >>> SKELETON.cubes.MY_CUBE.dimensions.FOO.hierarchies.BAR.key + ('foo', 'bar') + >>> SKELETON.cubes.MY_CUBE.dimensions.FOO.hierarchies.BAR.levels.BAZ.key + ('foo', 'bar', 'baz') + + This works well with :func:`atoti.mapping_lookup` when ``check=False`` since that mode requires unambiguous keys. + + """ + + name: Final = "__root__" + + def __init__(self) -> None: + self._set_path(parent_path=()) diff --git a/pyproject.toml b/pyproject.toml index 22f6e62..0885a6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "pandas", "pydantic", "pydantic-settings", + "typing-extensions", ] [tool.mypy] @@ -22,11 +23,14 @@ module = ["docker", "docker.*"] ignore_missing_imports = true [tool.pytest.ini_options] -addopts = "--strict-markers" +addopts = "--doctest-modules --strict-markers" asyncio_default_fixture_loop_scope = "session" asyncio_mode = "auto" filterwarnings = ["error"] +[tool.ruff.format] +docstring-code-format = true + [tool.ruff.lint] ignore = [ "COM812", # Covered by the formatter. diff --git a/tests/docker/test_docker.py b/tests/docker/test_docker.py index 3bf7938..4ae5ff4 100644 --- a/tests/docker/test_docker.py +++ b/tests/docker/test_docker.py @@ -1,11 +1,12 @@ import atoti as tt -from app import Cube +from app import SKELETON +from app.util.skeleton import CONTRIBUTORS_COUNT def test_session_inside_docker_container( session_inside_docker_container: tt.Session, ) -> None: - cube = session_inside_docker_container.cubes[Cube.STATION.value] - result_df = cube.query(cube.measures["contributors.COUNT"]) - assert result_df["contributors.COUNT"][0] > 0 + cube = session_inside_docker_container.cubes[SKELETON.cubes.STATION.key] + result_df = cube.query(cube.measures[CONTRIBUTORS_COUNT]) + assert result_df[CONTRIBUTORS_COUNT][0] > 0 diff --git a/tests/test_session.py b/tests/test_session.py index 8e9cc6c..3544ebc 100644 --- a/tests/test_session.py +++ b/tests/test_session.py @@ -1,16 +1,16 @@ import atoti as tt import pandas as pd -from app import Cube, StationCubeLocationLevel, StationCubeMeasure +from app import SKELETON +from app.util.skeleton import CONTRIBUTORS_COUNT def test_total_capacity(session: tt.Session) -> None: - station_cube = session.cubes[Cube.STATION.value] - result = station_cube.query( - station_cube.measures[StationCubeMeasure.CAPACITY.value] - ) + skeleton = SKELETON.cubes.STATION + cube = session.cubes[skeleton.key] + result = cube.query(cube.measures[skeleton.measures.CAPACITY.key]) expected_result = pd.DataFrame( - columns=[StationCubeMeasure.CAPACITY.value], + columns=[skeleton.measures.CAPACITY.name], data=[ (45_850), ], @@ -20,10 +20,16 @@ def test_total_capacity(session: tt.Session) -> None: def test_departments(session: tt.Session) -> None: - station_cube = session.cubes[Cube.STATION.value] - result = station_cube.query( - station_cube.measures["contributors.COUNT"], - levels=[station_cube.levels[StationCubeLocationLevel.DEPARTMENT.value]], + skeleton = SKELETON.cubes.STATION + cube = session.cubes[skeleton.key] + l, m = cube.levels, cube.measures + result = cube.query( + m[CONTRIBUTORS_COUNT], + levels=[ + l[ + skeleton.dimensions.STATION_DETAILS.hierarchies.LOCATION.levels.DEPARTMENT.key + ] + ], ) assert list(result.index) == [ "75, Paris, Île-de-France", diff --git a/uv.lock b/uv.lock index 3303054..a2a5cb4 100644 --- a/uv.lock +++ b/uv.lock @@ -40,6 +40,7 @@ dependencies = [ { name = "pandas" }, { name = "pydantic" }, { name = "pydantic-settings" }, + { name = "typing-extensions" }, ] [package.dev-dependencies] @@ -59,6 +60,7 @@ requires-dist = [ { name = "pandas" }, { name = "pydantic" }, { name = "pydantic-settings" }, + { name = "typing-extensions" }, ] [package.metadata.requires-dev]