🐛 Minor bug fixes and coverage improvements (#23)
Co-authored-by: Ben Rutter
benrutter authored and Ben Rutter committed Jan 3, 2025
1 parent 20a9b47 commit c902626
Showing 16 changed files with 216 additions and 37 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
@@ -83,6 +83,6 @@ jobs:
env:
GITHUB_TOKEN: ${{ github.token }}
run: >-
gh release upload
'${{ github.ref_name }}' dist/**
TAG_NAME=$(date '+%Y%m%d%H%M%S')
gh release upload "$TAG_NAME" dist/**
--repo '${{ github.repository }}'
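
(Side note: the new TAG_NAME is a plain timestamp. The shell `date '+%Y%m%d%H%M%S'` format corresponds to the strftime sketch below, shown for illustration only and not part of the workflow.)

```python
from datetime import datetime

# Equivalent of the shell `date '+%Y%m%d%H%M%S'` used for TAG_NAME,
# e.g. "20250103093045" for 3 Jan 2025, 09:30:45
tag_name = datetime.now().strftime("%Y%m%d%H%M%S")
print(tag_name)
```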
4 changes: 3 additions & 1 deletion .github/workflows/unit-tests.yml
@@ -19,4 +19,6 @@ jobs:
python -m pip install --upgrade pip
python -m pip install -r requirements-dev.lock
- name: Run tests
run: python -m pytest
run: python -m coverage run -m pytest
- name: Check coverage
run: python -m coverage report --fail-under=100
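
(Side note: the same gate can be reproduced locally with the commands the workflow runs; a minimal sketch, assuming the dev dependencies from requirements-dev.lock are installed.)

```python
import subprocess
import sys

# Mirror the two CI steps: run pytest under coverage, then enforce the 100% threshold.
# check=True makes a failing step raise CalledProcessError, much like a failing CI job.
subprocess.run([sys.executable, "-m", "coverage", "run", "-m", "pytest"], check=True)
subprocess.run([sys.executable, "-m", "coverage", "report", "--fail-under=100"], check=True)
```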
2 changes: 2 additions & 0 deletions README.md
@@ -3,7 +3,9 @@
[![PyPI version](https://badge.fury.io/py/wimsey.svg)](https://pypi.org/project/wimsey/)
[![License](https://img.shields.io/github/license/benrutter/wimsey)](https://github.com/benrutter/wimsey/blob/main/LICENSE)
[![Static Badge](https://img.shields.io/badge/Docs-mkdocs-blue)](https://benrutter.github.io/wimsey)
![coverage](https://img.shields.io/badge/coverage-100-green)

<img src="./docs/assets/wimsey-on-a-computer.jpg" alt="A line drawing of Lord Peter Wimsey looking at a computer through a microscope" width="300" />

A lightweight, flexible and fully open-source data contract library.

Binary file added docs/assets/wimsey-on-a-computer.jpg
3 changes: 3 additions & 0 deletions docs/assets/wimsey-on-a-computer.jpg:Zone.Identifier
@@ -0,0 +1,3 @@
[ZoneTransfer]
ZoneId=3
HostUrl=about:internet
1 change: 1 addition & 0 deletions docs/index.md
@@ -1,5 +1,6 @@
# Wimsey 🔍

<img src="./assets/wimsey-on-a-computer.jpg" alt="A line drawing of Lord Peter Wimsey looking at a computer through a microscope" width="300" />

Wimsey is a lightweight, flexible and fully open-source data contract library. It's designed to let you:

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -33,6 +33,8 @@ dev-dependencies = [
"mkdocs-material>=9.5.42",
"grip>=4.6.2",
"mkdocs-charts-plugin>=0.0.12",
"coverage>=7.6.10",
"pylint>=3.3.3",
]

[tool.hatch.metadata]
15 changes: 15 additions & 0 deletions requirements-dev.lock
@@ -10,6 +10,8 @@
# universal: false

-e file:.
astroid==3.3.8
# via pylint
asttokens==2.4.1
# via stack-data
babel==2.16.0
@@ -28,12 +30,15 @@ cloudpickle==3.1.0
# via dask
colorama==0.4.6
# via mkdocs-material
coverage==7.6.10
dask==2024.10.0
# via dask-expr
dask-expr==1.1.16
decorator==5.1.1
# via ipdb
# via ipython
dill==0.3.9
# via pylint
docopt==0.6.2
# via grip
exceptiongroup==1.2.0
@@ -58,6 +63,8 @@ iniconfig==2.0.0
ipdb==0.13.13
ipython==8.21.0
# via ipdb
isort==5.13.2
# via pylint
itsdangerous==2.2.0
# via flask
jedi==0.19.1
@@ -79,6 +86,8 @@ markupsafe==3.0.2
# via werkzeug
matplotlib-inline==0.1.6
# via ipython
mccabe==0.7.0
# via pylint
mergedeep==1.3.4
# via mkdocs
# via mkdocs-get-deps
@@ -119,6 +128,7 @@ pexpect==4.9.0
# via ipython
platformdirs==4.3.6
# via mkdocs-get-deps
# via pylint
pluggy==1.5.0
# via pytest
polars==1.10.0
@@ -134,6 +144,7 @@ pygments==2.17.2
# via grip
# via ipython
# via mkdocs-material
pylint==3.3.3
pymdown-extensions==10.11.2
# via mkdocs-charts-plugin
# via mkdocs-material
@@ -164,7 +175,10 @@ stack-data==0.6.3
tomli==2.0.1
# via ipdb
# via mypy
# via pylint
# via pytest
tomlkit==0.13.2
# via pylint
toolz==1.0.0
# via dask
# via partd
@@ -173,6 +187,7 @@ traitlets==5.14.1
# via matplotlib-inline
types-pyyaml==6.0.12.20240917
typing-extensions==4.12.2
# via astroid
# via mypy
tzdata==2024.2
# via pandas
36 changes: 36 additions & 0 deletions tests/test_config.py
@@ -60,6 +60,42 @@ def open_file_patch(*args, **kwargs):
actual = config.read_config("file.yaml")
assert all(isinstance(i, Callable) for i in actual)

def test_read_config_parses_yaml_with_test_section(monkeypatch, test_suite):
class DummyOpenFile:
def __enter__(self, *args, **kwargs):
return self

def __exit__(self, *args, **kwargs):
...

def read(self, *args, **kwargs):
return yaml.dump({"cool": ["some", "cool", "stuff"], "tests": test_suite})

def open_file_patch(*args, **kwargs):
return DummyOpenFile()

monkeypatch.setattr(config.fsspec, "open", open_file_patch)
actual = config.read_config("file.yaml")
assert all(isinstance(i, Callable) for i in actual)

def test_read_config_parses_yaml_with_only_one_test(monkeypatch, test_suite):
class DummyOpenFile:
def __enter__(self, *args, **kwargs):
return self

def __exit__(self, *args, **kwargs):
...

def read(self, *args, **kwargs):
return yaml.dump(test_suite[0])

def open_file_patch(*args, **kwargs):
return DummyOpenFile()

monkeypatch.setattr(config.fsspec, "open", open_file_patch)
actual = config.read_config("file.yaml")
assert all(isinstance(i, Callable) for i in actual)


def test_read_config_parses_json(monkeypatch, test_suite):
class DummyOpenFile:
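
(Side note: the two new read_config YAML tests above differ only in the payload returned by the stubbed fsspec.open. A parametrised variant, sketched below and reusing the DummyOpenFile idea and the existing test_suite fixture, could cover both payload shapes in one test; this is an illustration, not part of the commit.)

```python
from collections.abc import Callable

import pytest
import yaml

from wimsey import config


@pytest.mark.parametrize(
    "make_payload",
    [
        lambda suite: yaml.dump({"cool": ["some", "cool", "stuff"], "tests": suite}),  # "tests" section
        lambda suite: yaml.dump(suite[0]),                                             # single test mapping
    ],
)
def test_read_config_parses_yaml_shapes(monkeypatch, test_suite, make_payload):
    class DummyOpenFile:
        def __enter__(self, *args, **kwargs):
            return self

        def __exit__(self, *args, **kwargs): ...

        def read(self, *args, **kwargs):
            return make_payload(test_suite)

    monkeypatch.setattr(config.fsspec, "open", lambda *args, **kwargs: DummyOpenFile())
    actual = config.read_config("file.yaml")
    assert all(isinstance(i, Callable) for i in actual)
```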
14 changes: 14 additions & 0 deletions tests/test_dataframe.py
@@ -12,6 +12,15 @@ def test_that_describe_returns_expected_dictionary_for_df() -> None:
assert actual["length"] == 3
assert actual["columns"] == "a_^&^_b"

def test_that_describe_returns_expected_dictionary_for_lazy_frame() -> None:
df = pl.LazyFrame({"a": [1.2, 1.3, 1.4], "b": ["one", "two", None]})
actual = dataframe.describe(df)
assert 1.29 < actual["mean_a"] < 1.31
assert actual["null_count_b"] == 1
assert 0.332 < actual["null_percentage_b"] < 0.334
assert actual["length"] == 3
assert actual["columns"] == "a_^&^_b"


def test_that_describe_excludes_non_specified_columns() -> None:
df = pl.DataFrame({"a": [1.2, 1.3, 1.4], "b": ["one", "two", None]})
@@ -56,3 +65,8 @@ def test_that_profile_from_samples_returns_list_of_dicts_of_expected_length() ->
assert len(actual) == 20
assert actual[10]["mean_a"] == 1.3
assert actual[4]["columns"] == "a_^&^_b"

def test_that_describe_returns_empty_dict_for_empty_dataframe() -> None:
actual = dataframe.describe(pl.DataFrame())
assert isinstance(actual, dict)
assert len(actual) == 0
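
(Side note: the new lazy-frame test mirrors the eager one line for line. A quick interactive check of the same behaviour might look like the sketch below; key names are taken from the assertions above, and the import path follows the one in wimsey/execution.py.)

```python
import polars as pl

from wimsey import dataframe

eager = pl.DataFrame({"a": [1.2, 1.3, 1.4], "b": ["one", "two", None]})
lazy = eager.lazy()

# describe() returns the same flat dict of metrics for eager and lazy inputs
eager_stats = dataframe.describe(eager)
lazy_stats = dataframe.describe(lazy)

assert eager_stats["length"] == lazy_stats["length"] == 3
assert eager_stats["null_count_b"] == lazy_stats["null_count_b"] == 1
```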
104 changes: 103 additions & 1 deletion tests/test_executions.py
@@ -1,4 +1,5 @@
import polars as pl
import pytest

from wimsey import execution
from wimsey import tests
@@ -7,11 +8,112 @@
def test_run_all_tests_produces_expected_result_object():
tests_to_carry_out = [
tests.max_should(column="a", be_less_than=10),
tests.std_should(column="a", be_greated_than=0),
tests.std_should(column="a", be_greater_than=0),
tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True

def test_validate_carries_out_tests_then_returns_object_if_passing():
tests_to_carry_out = [
tests.max_should(column="a", be_less_than=10),
tests.std_should(column="a", be_greater_than=0),
tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.validate(df, tests_to_carry_out)
assert isinstance(actual, pl.DataFrame)

def test_validate_raises_error_if_tests_fail():
tests_to_carry_out = [
tests.max_should(column="a", be_less_than=0),
tests.std_should(column="a", be_greater_than=10),
tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
with pytest.raises(execution.DataValidationException):
execution.validate(df, tests_to_carry_out)


def test_row_count_expectations_pass_when_expected():
tests_to_carry_out = [
tests.row_count_should(
be_less_than=3.1,
be_less_than_or_equal_to=3,
be_greater_than=-2.343,
be_greater_than_or_equal_to=0.3,
be_exactly=3,
)
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True

def test_columns_should_have_expectations_fail_when_expected():
tests_to_carry_out = [
tests.columns_should(
have="c",
not_have="a",
be=["b", "c"],
)
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is False

def test_column_type_tests_pass_when_expected():
tests_to_carry_out = [
tests.type_should(column="a", be="int64"),
tests.type_should(column="a", be_one_of=["int64", "float64"]),
tests.type_should(column="a", not_be="float64"),
tests.type_should(column="b", be="string")
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True


def test_average_column_difference_tests_pass_when_expected():
tests_to_carry_out = [
tests.average_difference_from_other_column_should(
column="a",
other_column="b",
be_exactly=0,
be_less_than=2,
be_greater_than=-1,
be_less_than_or_equal_to=0,
be_greater_than_or_equal_to=0,
),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True


def test_average_column_ratio_tests_pass_when_expected():
tests_to_carry_out = [
tests.average_ratio_to_other_column_should(
column="b",
other_column="a",
be_exactly=2,
be_less_than=3,
be_greater_than=1,
be_less_than_or_equal_to=2,
be_greater_than_or_equal_to=2,
),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True
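
(Side note: the validate/run_all_tests split exercised above maps onto two usage styles, report-and-inspect versus gate-and-raise. A minimal sketch, built only from calls that appear in these tests and assuming wimsey is installed:)

```python
import polars as pl

from wimsey import execution, tests

checks = [
    tests.max_should(column="a", be_less_than=10),
    tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})

# Report style: run everything, then inspect .success / .results without raising
report = execution.run_all_tests(df, checks)
print(report.success, [result.success for result in report.results])

# Gate style: validate() hands the dataframe back on success, raises on failure
try:
    df = execution.validate(df, checks)
except execution.DataValidationException:
    raise SystemExit("data contract failed")
```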

2 changes: 1 addition & 1 deletion tests/test_tests.py
@@ -17,7 +17,7 @@ def test_all_possible_tests_exposed_as_variables_of_the_same_name_in_module() ->

def test_average_ratio_to_other_column_should_matches_expected() -> None:
test = tests.average_ratio_to_other_column_should(
"a", "b", be_greater_than=0.09, be_less_than=0.11
"a", "b", be_greater_than=0.09, be_less_than=0.11,
)
passing_result = test({"mean_a": 13, "mean_b": 130})
failing_result = test({"mean_a": 13, "mean_b": 160})
2 changes: 1 addition & 1 deletion wimsey/_version.py
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.4.1"
19 changes: 11 additions & 8 deletions wimsey/execution.py
@@ -4,14 +4,14 @@
from narwhals.typing import FrameT

from wimsey.dataframe import describe
from wimsey.tests import result
from wimsey.tests import Result
from wimsey.config import read_config, collect_tests


@dataclass
class final_result:
class FinalResult:
success: bool
results: list[result]
results: list[Result]


class DataValidationException(Exception):
@@ -26,15 +26,18 @@ def _as_set(val: Any) -> set:
return {val} if val is not None else set()


def run_all_tests(df: FrameT, tests: list[Callable[[Any], result]]) -> final_result:
def run_all_tests(df: FrameT, tests: list[Callable[[Any], Result]]) -> FinalResult:
"""
Run all given tests on a dataframe. Will return a `FinalResult` object
"""
columns: set[str] | None = set()
metrics: set[str] | None = set()
for test in tests:
try:
metrics |= test.required_metrics
columns |= _as_set(test.keywords.get("column"))
columns |= _as_set(test.keywords.get("other_column"))
except AttributeError:
except AttributeError: # pragma: no cover
columns = None # fall back to calculating everything
metrics = None
break
@@ -43,18 +46,18 @@ def run_all_tests(df: FrameT, tests: list[Callable[[Any], result]]) -> final_res
columns=list(columns),
metrics=list(metrics),
)
results: list[result] = []
results: list[Result] = []
for i_test in tests:
results.append(i_test(description))
return final_result(
return FinalResult(
success=all(i.success for i in results),
results=results,
)


def test(
df: FrameT, contract: str | list[dict] | dict, storage_options: dict | None = None
) -> final_result:
) -> FinalResult:
"""
Carry out tests on dataframe and return results. This will *not* raise
an exception on test failure, and will instead return a 'final_result'
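
(Side note: the column/metric pruning in run_all_tests relies on each test exposing required_metrics and functools.partial-style keywords; anything without those attributes hits the AttributeError branch and falls back to describing everything. A toy sketch of that fallback, with illustrative names that are not part of wimsey's API:)

```python
from functools import partial
from typing import Any


def check_mean(description: dict[str, Any], column: str, be_less_than: float) -> bool:
    """Toy stand-in for a wimsey check: it only reads one precomputed metric."""
    return description[f"mean_{column}"] < be_less_than


# partial objects can carry attributes and expose .keywords,
# which run_all_tests inspects for "column" / "other_column".
toy_check = partial(check_mean, column="a", be_less_than=10)
toy_check.required_metrics = {"mean_a"}  # illustrative attribute, mirroring the read above

columns: set[str] | None = set()
metrics: set[str] | None = set()
for check in [toy_check]:
    try:
        metrics |= check.required_metrics
        columns |= {check.keywords["column"]}
    except AttributeError:  # e.g. a plain lambda: fall back to computing everything
        columns = None
        metrics = None
        break

print(columns, metrics)  # -> {'a'} {'mean_a'}
```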