🐛 Minor bug fixes and coverage improvements (#23)
Co-authored-by: Ben Rutter
benrutter authored and Ben Rutter committed Jan 3, 2025
1 parent 20a9b47 commit c902626
Showing 16 changed files with 216 additions and 37 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
@@ -83,6 +83,6 @@ jobs:
env:
GITHUB_TOKEN: ${{ github.token }}
run: >-
gh release upload
'${{ github.ref_name }}' dist/**
TAG_NAME=$(date '+%Y%m%d%H%M%S')
gh release upload "$TAG_NAME" dist/**
--repo '${{ github.repository }}'
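
(Side note: the new TAG_NAME is a plain timestamp. The shell `date '+%Y%m%d%H%M%S'` format corresponds to the strftime sketch below, shown for illustration only and not part of the workflow.)

```python
from datetime import datetime

# Equivalent of the shell `date '+%Y%m%d%H%M%S'` used for TAG_NAME,
# e.g. "20250103093045" for 3 Jan 2025, 09:30:45
tag_name = datetime.now().strftime("%Y%m%d%H%M%S")
print(tag_name)
```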
4 changes: 3 additions & 1 deletion .github/workflows/unit-tests.yml
@@ -19,4 +19,6 @@ jobs:
python -m pip install --upgrade pip
python -m pip install -r requirements-dev.lock
- name: Run tests
run: python -m pytest
run: python -m coverage run -m pytest
- name: Check coverage
run: python -m coverage report --fail-under=100
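
(Side note: the same gate can be reproduced locally with the commands the workflow runs; a minimal sketch, assuming the dev dependencies from requirements-dev.lock are installed.)

```python
import subprocess
import sys

# Mirror the two CI steps: run pytest under coverage, then enforce the 100% threshold.
# check=True makes a failing step raise CalledProcessError, much like a failing CI job.
subprocess.run([sys.executable, "-m", "coverage", "run", "-m", "pytest"], check=True)
subprocess.run([sys.executable, "-m", "coverage", "report", "--fail-under=100"], check=True)
```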
2 changes: 2 additions & 0 deletions README.md
@@ -3,7 +3,9 @@
[![PyPI version](https://badge.fury.io/py/wimsey.svg)](https://pypi.org/project/wimsey/)
[![License](https://img.shields.io/github/license/benrutter/wimsey)](https://github.com/benrutter/wimsey/blob/main/LICENSE)
[![Static Badge](https://img.shields.io/badge/Docs-mkdocs-blue)](https://benrutter.github.io/wimsey)
![coverage](https://img.shields.io/badge/coverage-100-green)

<img src="./docs/assets/wimsey-on-a-computer.jpg" alt="A line drawing of Lord Peter Wimsey looking at a computer through a microscope" width="300" />

A lightweight, flexible and fully open-source data contract library.

Binary file added docs/assets/wimsey-on-a-computer.jpg
3 changes: 3 additions & 0 deletions docs/assets/wimsey-on-a-computer.jpg:Zone.Identifier
@@ -0,0 +1,3 @@
[ZoneTransfer]
ZoneId=3
HostUrl=about:internet
1 change: 1 addition & 0 deletions docs/index.md
@@ -1,5 +1,6 @@
# Wimsey 🔍

<img src="./assets/wimsey-on-a-computer.jpg" alt="A line drawing of Lord Peter Wimsey looking at a computer through a microscope" width="300" />

Wimsey is a lightweight, flexible and fully open-source data contract library. It's designed to let you:

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -33,6 +33,8 @@ dev-dependencies = [
"mkdocs-material>=9.5.42",
"grip>=4.6.2",
"mkdocs-charts-plugin>=0.0.12",
"coverage>=7.6.10",
"pylint>=3.3.3",
]

[tool.hatch.metadata]
15 changes: 15 additions & 0 deletions requirements-dev.lock
@@ -10,6 +10,8 @@
# universal: false

-e file:.
astroid==3.3.8
# via pylint
asttokens==2.4.1
# via stack-data
babel==2.16.0
@@ -28,12 +30,15 @@ cloudpickle==3.1.0
# via dask
colorama==0.4.6
# via mkdocs-material
coverage==7.6.10
dask==2024.10.0
# via dask-expr
dask-expr==1.1.16
decorator==5.1.1
# via ipdb
# via ipython
dill==0.3.9
# via pylint
docopt==0.6.2
# via grip
exceptiongroup==1.2.0
@@ -58,6 +63,8 @@ iniconfig==2.0.0
ipdb==0.13.13
ipython==8.21.0
# via ipdb
isort==5.13.2
# via pylint
itsdangerous==2.2.0
# via flask
jedi==0.19.1
@@ -79,6 +86,8 @@ markupsafe==3.0.2
# via werkzeug
matplotlib-inline==0.1.6
# via ipython
mccabe==0.7.0
# via pylint
mergedeep==1.3.4
# via mkdocs
# via mkdocs-get-deps
@@ -119,6 +128,7 @@ pexpect==4.9.0
# via ipython
platformdirs==4.3.6
# via mkdocs-get-deps
# via pylint
pluggy==1.5.0
# via pytest
polars==1.10.0
@@ -134,6 +144,7 @@ pygments==2.17.2
# via grip
# via ipython
# via mkdocs-material
pylint==3.3.3
pymdown-extensions==10.11.2
# via mkdocs-charts-plugin
# via mkdocs-material
@@ -164,7 +175,10 @@ stack-data==0.6.3
tomli==2.0.1
# via ipdb
# via mypy
# via pylint
# via pytest
tomlkit==0.13.2
# via pylint
toolz==1.0.0
# via dask
# via partd
@@ -173,6 +187,7 @@ traitlets==5.14.1
# via matplotlib-inline
types-pyyaml==6.0.12.20240917
typing-extensions==4.12.2
# via astroid
# via mypy
tzdata==2024.2
# via pandas
36 changes: 36 additions & 0 deletions tests/test_config.py
@@ -60,6 +60,42 @@ def open_file_patch(*args, **kwargs):
actual = config.read_config("file.yaml")
assert all(isinstance(i, Callable) for i in actual)

def test_read_config_parses_yaml_with_test_section(monkeypatch, test_suite):
class DummyOpenFile:
def __enter__(self, *args, **kwargs):
return self

def __exit__(self, *args, **kwargs):
...

def read(self, *args, **kwargs):
return yaml.dump({"cool": ["some", "cool", "stuff"], "tests": test_suite})

def open_file_patch(*args, **kwargs):
return DummyOpenFile()

monkeypatch.setattr(config.fsspec, "open", open_file_patch)
actual = config.read_config("file.yaml")
assert all(isinstance(i, Callable) for i in actual)

def test_read_config_parses_yaml_with_only_one_test(monkeypatch, test_suite):
class DummyOpenFile:
def __enter__(self, *args, **kwargs):
return self

def __exit__(self, *args, **kwargs):
...

def read(self, *args, **kwargs):
return yaml.dump(test_suite[0])

def open_file_patch(*args, **kwargs):
return DummyOpenFile()

monkeypatch.setattr(config.fsspec, "open", open_file_patch)
actual = config.read_config("file.yaml")
assert all(isinstance(i, Callable) for i in actual)


def test_read_config_parses_json(monkeypatch, test_suite):
class DummyOpenFile:
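
(Side note: the two new read_config YAML tests above differ only in the payload returned by the stubbed fsspec.open. A parametrised variant, sketched below and reusing the DummyOpenFile idea and the existing test_suite fixture, could cover both payload shapes in one test; this is an illustration, not part of the commit.)

```python
from collections.abc import Callable

import pytest
import yaml

from wimsey import config


@pytest.mark.parametrize(
    "make_payload",
    [
        lambda suite: yaml.dump({"cool": ["some", "cool", "stuff"], "tests": suite}),  # "tests" section
        lambda suite: yaml.dump(suite[0]),                                             # single test mapping
    ],
)
def test_read_config_parses_yaml_shapes(monkeypatch, test_suite, make_payload):
    class DummyOpenFile:
        def __enter__(self, *args, **kwargs):
            return self

        def __exit__(self, *args, **kwargs): ...

        def read(self, *args, **kwargs):
            return make_payload(test_suite)

    monkeypatch.setattr(config.fsspec, "open", lambda *args, **kwargs: DummyOpenFile())
    actual = config.read_config("file.yaml")
    assert all(isinstance(i, Callable) for i in actual)
```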
14 changes: 14 additions & 0 deletions tests/test_dataframe.py
@@ -12,6 +12,15 @@ def test_that_describe_returns_expected_dictionary_for_df() -> None:
assert actual["length"] == 3
assert actual["columns"] == "a_^&^_b"

def test_that_describe_returns_expected_dictionary_for_lazy_frame() -> None:
df = pl.LazyFrame({"a": [1.2, 1.3, 1.4], "b": ["one", "two", None]})
actual = dataframe.describe(df)
assert 1.29 < actual["mean_a"] < 1.31
assert actual["null_count_b"] == 1
assert 0.332 < actual["null_percentage_b"] < 0.334
assert actual["length"] == 3
assert actual["columns"] == "a_^&^_b"


def test_that_describe_excludes_non_specified_columns() -> None:
df = pl.DataFrame({"a": [1.2, 1.3, 1.4], "b": ["one", "two", None]})
@@ -56,3 +65,8 @@ def test_that_profile_from_samples_returns_list_of_dicts_of_expected_length() ->
assert len(actual) == 20
assert actual[10]["mean_a"] == 1.3
assert actual[4]["columns"] == "a_^&^_b"

def test_that_describe_returns_empty_dict_for_empty_dataframe() -> None:
actual = dataframe.describe(pl.DataFrame())
assert isinstance(actual, dict)
assert len(actual) == 0
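
(Side note: the new lazy-frame test mirrors the eager one line for line. A quick interactive check of the same behaviour might look like the sketch below; key names are taken from the assertions above, and the import path follows the one in wimsey/execution.py.)

```python
import polars as pl

from wimsey import dataframe

eager = pl.DataFrame({"a": [1.2, 1.3, 1.4], "b": ["one", "two", None]})
lazy = eager.lazy()

# describe() returns the same flat dict of metrics for eager and lazy inputs
eager_stats = dataframe.describe(eager)
lazy_stats = dataframe.describe(lazy)

assert eager_stats["length"] == lazy_stats["length"] == 3
assert eager_stats["null_count_b"] == lazy_stats["null_count_b"] == 1
```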
104 changes: 103 additions & 1 deletion tests/test_executions.py
@@ -1,4 +1,5 @@
import polars as pl
import pytest

from wimsey import execution
from wimsey import tests
@@ -7,11 +8,112 @@
def test_run_all_tests_produces_expected_result_object():
tests_to_carry_out = [
tests.max_should(column="a", be_less_than=10),
tests.std_should(column="a", be_greated_than=0),
tests.std_should(column="a", be_greater_than=0),
tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True

def test_validate_carries_out_tests_then_returns_object_if_passing():
tests_to_carry_out = [
tests.max_should(column="a", be_less_than=10),
tests.std_should(column="a", be_greater_than=0),
tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.validate(df, tests_to_carry_out)
assert isinstance(actual, pl.DataFrame)

def test_validate_raises_error_if_tests_fail():
tests_to_carry_out = [
tests.max_should(column="a", be_less_than=0),
tests.std_should(column="a", be_greater_than=10),
tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
with pytest.raises(execution.DataValidationException):
execution.validate(df, tests_to_carry_out)


def test_row_count_expectations_pass_when_expected():
tests_to_carry_out = [
tests.row_count_should(
be_less_than=3.1,
be_less_than_or_equal_to=3,
be_greater_than=-2.343,
be_greater_than_or_equal_to=0.3,
be_exactly=3,
)
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True

def test_columns_should_have_expectations_fail_when_expected():
tests_to_carry_out = [
tests.columns_should(
have="c",
not_have="a",
be=["b", "c"],
)
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is False

def test_column_type_tests_pass_when_expected():
tests_to_carry_out = [
tests.type_should(column="a", be="int64"),
tests.type_should(column="a", be_one_of=["int64", "float64"]),
tests.type_should(column="a", not_be="float64"),
tests.type_should(column="b", be="string")
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True


def test_average_column_difference_tests_pass_when_expected():
tests_to_carry_out = [
tests.average_difference_from_other_column_should(
column="a",
other_column="b",
be_exactly=0,
be_less_than=2,
be_greater_than=-1,
be_less_than_or_equal_to=0,
be_greater_than_or_equal_to=0,
),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True


def test_average_column_ratio_tests_pass_when_expected():
tests_to_carry_out = [
tests.average_ratio_to_other_column_should(
column="b",
other_column="a",
be_exactly=2,
be_less_than=3,
be_greater_than=1,
be_less_than_or_equal_to=2,
be_greater_than_or_equal_to=2,
),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]})
actual = execution.run_all_tests(df, tests_to_carry_out)
assert actual.success is True
for result in actual.results:
assert result.success is True
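
(Side note: the validate/run_all_tests split exercised above maps onto two usage styles, report-and-inspect versus gate-and-raise. A minimal sketch, built only from calls that appear in these tests and assuming wimsey is installed:)

```python
import polars as pl

from wimsey import execution, tests

checks = [
    tests.max_should(column="a", be_less_than=10),
    tests.type_should(column="b", be_one_of=["string", "int64"]),
]
df = pl.DataFrame({"a": [1, 2, 3], "b": ["hat", "bat", "cat"]})

# Report style: run everything, then inspect .success / .results without raising
report = execution.run_all_tests(df, checks)
print(report.success, [result.success for result in report.results])

# Gate style: validate() hands the dataframe back on success, raises on failure
try:
    df = execution.validate(df, checks)
except execution.DataValidationException:
    raise SystemExit("data contract failed")
```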

2 changes: 1 addition & 1 deletion tests/test_tests.py
@@ -17,7 +17,7 @@ def test_all_possible_tests_exposed_as_variables_of_the_same_name_in_module() ->

def test_average_ratio_to_other_column_should_matches_expected() -> None:
test = tests.average_ratio_to_other_column_should(
"a", "b", be_greater_than=0.09, be_less_than=0.11
"a", "b", be_greater_than=0.09, be_less_than=0.11,
)
passing_result = test({"mean_a": 13, "mean_b": 130})
failing_result = test({"mean_a": 13, "mean_b": 160})
2 changes: 1 addition & 1 deletion wimsey/_version.py
@@ -1 +1 @@
__version__ = "0.4.0"
__version__ = "0.4.1"
19 changes: 11 additions & 8 deletions wimsey/execution.py
@@ -4,14 +4,14 @@
from narwhals.typing import FrameT

from wimsey.dataframe import describe
from wimsey.tests import result
from wimsey.tests import Result
from wimsey.config import read_config, collect_tests


@dataclass
class final_result:
class FinalResult:
success: bool
results: list[result]
results: list[Result]


class DataValidationException(Exception):
@@ -26,15 +26,18 @@ def _as_set(val: Any) -> set:
return {val} if val is not None else set()


def run_all_tests(df: FrameT, tests: list[Callable[[Any], result]]) -> final_result:
def run_all_tests(df: FrameT, tests: list[Callable[[Any], Result]]) -> FinalResult:
"""
Run all given tests on a dataframe. Will return a `FinalResult` object
"""
columns: set[str] | None = set()
metrics: set[str] | None = set()
for test in tests:
try:
metrics |= test.required_metrics
columns |= _as_set(test.keywords.get("column"))
columns |= _as_set(test.keywords.get("other_column"))
except AttributeError:
except AttributeError: # pragma: no cover
columns = None # fall back to calculating everything
metrics = None
break
@@ -43,18 +46,18 @@ def run_all_tests(df: FrameT, tests: list[Callable[[Any], result]]) -> final_res
columns=list(columns),
metrics=list(metrics),
)
results: list[result] = []
results: list[Result] = []
for i_test in tests:
results.append(i_test(description))
return final_result(
return FinalResult(
success=all(i.success for i in results),
results=results,
)


def test(
df: FrameT, contract: str | list[dict] | dict, storage_options: dict | None = None
) -> final_result:
) -> FinalResult:
"""
Carry out tests on dataframe and return results. This will *not* raise
an exception on test failure, and will instead return a 'final_result'
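
(Side note: the column/metric pruning in run_all_tests relies on each test exposing required_metrics and functools.partial-style keywords; anything without those attributes hits the AttributeError branch and falls back to describing everything. A toy sketch of that fallback, with illustrative names that are not part of wimsey's API:)

```python
from functools import partial
from typing import Any


def check_mean(description: dict[str, Any], column: str, be_less_than: float) -> bool:
    """Toy stand-in for a wimsey check: it only reads one precomputed metric."""
    return description[f"mean_{column}"] < be_less_than


# partial objects can carry attributes and expose .keywords,
# which run_all_tests inspects for "column" / "other_column".
toy_check = partial(check_mean, column="a", be_less_than=10)
toy_check.required_metrics = {"mean_a"}  # illustrative attribute, mirroring the read above

columns: set[str] | None = set()
metrics: set[str] | None = set()
for check in [toy_check]:
    try:
        metrics |= check.required_metrics
        columns |= {check.keywords["column"]}
    except AttributeError:  # e.g. a plain lambda: fall back to computing everything
        columns = None
        metrics = None
        break

print(columns, metrics)  # -> {'a'} {'mean_a'}
```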