diff --git a/.github/workflows/ipynb_ci.yml b/.github/workflows/ipynb_ci.yml index 012c840..df91d2e 100644 --- a/.github/workflows/ipynb_ci.yml +++ b/.github/workflows/ipynb_ci.yml @@ -21,13 +21,15 @@ jobs: name: Check ${{ matrix.nb-file }} Notebook Execution steps: - uses: actions/checkout@v3 - - uses: mamba-org/provision-with-micromamba@main + - uses: mamba-org/setup-micromamba@main with: - environment-file: false environment-name: temp - channels: defaults,conda-forge - channel-priority: flexible - extra-specs: | + condarc: | + channels: + - defaults + - conda-forge + channel_priority: flexible + create-args: | python=3.11 - name: Install dependencies run: | diff --git a/.github/workflows/reproduce_paper.yml b/.github/workflows/reproduce_paper.yml index 35d10b5..fc264c6 100644 --- a/.github/workflows/reproduce_paper.yml +++ b/.github/workflows/reproduce_paper.yml @@ -25,13 +25,15 @@ jobs: name: Reproduce Paper Data Splits steps: - uses: actions/checkout@v3 - - uses: mamba-org/provision-with-micromamba@main + - uses: mamba-org/setup-micromamba@main with: - environment-file: false environment-name: temp - channels: defaults,conda-forge - channel-priority: flexible - extra-specs: | + condarc: | + channels: + - defaults + - conda-forge + channel_priority: flexible + create-args: | python=3.11 - name: Install Dependencies run: | diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b5da95f..a306924 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -28,13 +28,15 @@ jobs: name: ${{ matrix.os }} Python ${{ matrix.python-version }} Subtest steps: - uses: actions/checkout@v3 - - uses: mamba-org/provision-with-micromamba@main + - uses: mamba-org/setup-micromamba@main with: - environment-file: false environment-name: temp - channels: defaults,conda-forge - channel-priority: flexible - extra-specs: | + condarc: | + channels: + - defaults + - conda-forge + channel_priority: flexible + create-args: | python=${{ 
matrix.python-version }} - name: Install Dependencies run: | diff --git a/README.md b/README.md index 5636312..e948978 100644 --- a/README.md +++ b/README.md @@ -126,15 +126,19 @@ Configuration options for the featurization scheme can be found in the documenta To that end, the default behavior of `astartes` is to use `42` as the random seed and _always_ set it. Running `astartes` with the default settings will always produce the exact same results. We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version). -We are limited in our ability to test on M1 Macs, but from our limited manual testing we achieve perfect reproducbility in all cases _except occasionally_ with `KMeans` on Apple silicon. It has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters being assigned differently resulting in data splits which are >99% identical. `astartes` is still consistent between runs on the same platform in all cases. -## Evaluate the impact of splitting algorithms +> **Note** +> We are limited in our ability to test on M1 Macs, but from our limited manual testing we achieve perfect reproducibility in all cases _except occasionally_ with `KMeans` on Apple silicon. +It has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters being assigned differently resulting in data splits which are >99% identical. +`astartes` is still consistent between runs on the same platform in all cases, and other samplers are not impacted by this apparent bug. + +## Evaluate the Impact of Splitting Algorithms The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. 
All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument. ``` from sklearn.svm import LinearSVR -from astartes.utils.utils import generate_regression_results_dict +from astartes.utils import generate_regression_results_dict sklearn_model = LinearSVR() results_dict = generate_regression_results_dict( diff --git a/astartes/utils/__init__.py b/astartes/utils/__init__.py index e69de29..141f516 100644 --- a/astartes/utils/__init__.py +++ b/astartes/utils/__init__.py @@ -0,0 +1,7 @@ +# import functions from this directory's contents so that users can import +# them with `from astartes.utils import *` +# internally, we do NOT do this to make the imports more explicit, i.e. +# `from astartes.utils.exceptions import *` +from .user_utils import generate_regression_results_dict + +__all__ = ["generate_regression_results_dict"] diff --git a/astartes/utils/utils.py b/astartes/utils/user_utils.py similarity index 94% rename from astartes/utils/utils.py rename to astartes/utils/user_utils.py index 8e65f7a..41ff343 100644 --- a/astartes/utils/utils.py +++ b/astartes/utils/user_utils.py @@ -2,8 +2,7 @@ from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from tabulate import tabulate -from astartes import train_val_test_split -from astartes.utils.exceptions import InvalidModelTypeError +import astartes def generate_regression_results_dict( @@ -57,7 +56,9 @@ def generate_regression_results_dict( } """ if not isinstance(sklearn_model, sklearn.base.BaseEstimator): - raise InvalidModelTypeError("Model must be an sklearn model") + raise astartes.utils.exceptions.InvalidModelTypeError( + "Model must be an sklearn model" + ) final_dict = {} for sampler in samplers: @@ -80,7 +81,14 @@ def generate_regression_results_dict( } # obtain indices - _, _, _, train_indices, val_indices, test_indices = train_val_test_split( + ( + _, + _, + _, + train_indices, + val_indices, + test_indices, + ) 
= astartes.train_val_test_split( X, train_size=train_size, val_size=val_size, diff --git a/pyproject.toml b/pyproject.toml index 4018da7..027dbce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "astartes" -version = "1.0.3" +version = "1.1.0" authors = [ { name = "Jackson Burns", email = "jwburns@mit.edu" }, { name = "Himaghna Bhattacharjee", email = "himaghna@udel.edu" }, diff --git a/test/unit/utils/test_utils.py b/test/unit/utils/test_utils.py index bad81d4..3412ca1 100644 --- a/test/unit/utils/test_utils.py +++ b/test/unit/utils/test_utils.py @@ -4,8 +4,8 @@ from sklearn.svm import LinearSVR from astartes.samplers.interpolation import Random +from astartes.utils import generate_regression_results_dict from astartes.utils.exceptions import InvalidModelTypeError -from astartes.utils.utils import generate_regression_results_dict class Test_utils(unittest.TestCase):