diff --git a/.github/workflows/reproduce_paper.yml b/.github/workflows/reproduce_paper.yml index fc264c60..33a598d3 100644 --- a/.github/workflows/reproduce_paper.yml +++ b/.github/workflows/reproduce_paper.yml @@ -8,12 +8,10 @@ on: - cron: "0 8 * * 1" push: branches: [main] - pull_request: - branches: [main] workflow_dispatch: concurrency: - group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + group: actions-id-${{ github.workflow }} cancel-in-progress: true jobs: @@ -25,6 +23,7 @@ jobs: name: Reproduce Paper Data Splits steps: - uses: actions/checkout@v3 + - run: git checkout v1.0.0 - uses: mamba-org/setup-micromamba@main with: environment-name: temp @@ -38,6 +37,7 @@ jobs: - name: Install Dependencies run: | python -m pip install -e .[molecules] + python -m pip install scikit-learn==1.2.2 python -m pip install notebook - name: Backup Reference Splits run: | diff --git a/README.md b/README.md index e9489786..6e11ce21 100644 --- a/README.md +++ b/README.md @@ -127,11 +127,15 @@ To that end, the default behavior of `astartes` is to use `42` as the random see Running `astartes` with the default settings will always produce the exact same results. We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version). +#### Known Reproducibility Limitations + > **Note** > We are limited in our ability to test on M1 Macs, but from our limited manual testing we achieve perfect reproducbility in all cases _except occasionally_ with `KMeans` on Apple silicon. It has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters being assigned differently resulting in data splits which are >99% identical. `astartes` is still consistent between runs on the same platform in all cases, and other samplers are not impacted by this apparent bug. 
+ - `sklearn` v1.3.0 introduced backwards-incompatible changes in the `KMeans` sampler that changed how the random initialization affects the results, even given the same random seed. Different versions of `sklearn` will affect the performance of `astartes` and we recommend including the exact versions of `scikit-learn` and `astartes` used, when applicable. + ## Evaluate the Impact of Splitting Algorithms The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument. diff --git a/pyproject.toml b/pyproject.toml index 027dbce7..82b9d317 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "astartes" -version = "1.1.0" +version = "1.1.1" authors = [ { name = "Jackson Burns", email = "jwburns@mit.edu" }, { name = "Himaghna Bhattacharjee", email = "himaghna@udel.edu" }, @@ -21,7 +21,7 @@ classifiers = [ ] urls = { Homepage = "https://github.com/JacksonBurns/astartes" } requires-python = ">=3.7" -dependencies = ["scikit_learn", "numpy", "scipy", "pandas"] +dependencies = ["scikit_learn", "tabulate", "numpy", "scipy", "pandas"] [project.optional-dependencies] molecules = ["aimsim"] diff --git a/test/regression/reference_splits/kmeans_reference.pkl b/test/regression/reference_splits/kmeans-v1.2.2_reference.pkl similarity index 100% rename from test/regression/reference_splits/kmeans_reference.pkl rename to test/regression/reference_splits/kmeans-v1.2.2_reference.pkl diff --git a/test/regression/reference_splits/kmeans-v1.3_reference.pkl b/test/regression/reference_splits/kmeans-v1.3_reference.pkl new file mode 100644 index 00000000..c601286a Binary files /dev/null and b/test/regression/reference_splits/kmeans-v1.3_reference.pkl differ diff --git 
a/test/regression/test_regression.py b/test/regression/test_regression.py index d6bc8e2d..b3f81866 100644 --- a/test/regression/test_regression.py +++ b/test/regression/test_regression.py @@ -4,6 +4,7 @@ from datetime import datetime import numpy as np +import pkg_resources from astartes import train_val_test_split from astartes.samplers import ( @@ -12,6 +13,10 @@ IMPLEMENTED_INTERPOLATION_SAMPLERS, ) +SKLEARN_GEQ_13 = ( # get the sklearn version + int(pkg_resources.get_distribution("scikit-learn").version.split(".")[1]) >= 3 +) + class Test_regression(unittest.TestCase): """ @@ -34,8 +39,20 @@ def setUpClass(self): self.reference_splits = { name: os.path.join(self.reference_splits_dir, name + "_reference.pkl") for name in ALL_SAMPLERS - if name not in ("scaffold",) + if name + not in ( + "scaffold", + "kmeans", + ) } + self.reference_splits["kmeans-v1.3"] = os.path.join( + self.reference_splits_dir, + "kmeans-v1.3_reference.pkl", + ) + self.reference_splits["kmeans-v1.2.2"] = os.path.join( + self.reference_splits_dir, + "kmeans-v1.2.2_reference.pkl", + ) def test_timebased_regression(self): """Regression test TimeBased, which has labels to check as well.""" @@ -94,7 +111,7 @@ def test_interpolation_regression(self): def test_extrapolation_regression(self): """Regression testing of extrapolative methods relative to static results.""" for sampler_name in IMPLEMENTED_EXTRAPOLATION_SAMPLERS: - if sampler_name in ("scaffold", "time_based"): + if sampler_name in ("scaffold", "time_based", "kmeans"): continue ( X_train, @@ -133,6 +150,90 @@ def test_extrapolation_regression(self): "Sampler {:s} failed regression testing.".format(sampler_name), ) + @unittest.skipUnless( + SKLEARN_GEQ_13, + "sklearn version less than 1.3 detected", + ) + def test_kmeans_regression_sklearn_v13(self): + """Regression testing of KMeans in sklearn v1.3 or newer.""" + ( + X_train, + X_val, + X_test, + y_train, + y_val, + y_test, + clusters_train, + clusters_val, + clusters_test, + ) = 
train_val_test_split( + self.X, + self.y, + sampler="kmeans", + random_state=42, + ) + all_output = [ + X_train, + X_val, + X_test, + y_train, + y_val, + y_test, + clusters_train, + clusters_val, + clusters_test, + ] + with open(self.reference_splits["kmeans-v1.3"], "rb") as f: + reference_output = pkl.load(f) + for i, j in zip(all_output, reference_output): + np.testing.assert_array_equal( + i, + j, + "Sampler kmeans failed regression testing.", + ) + + @unittest.skipIf( + SKLEARN_GEQ_13, + "sklearn version 1.3 or newer detected", + ) + def test_kmeans_regression_sklearn_v12(self): + """Regression testing of KMeans in sklearn v1.2 or earlier.""" + ( + X_train, + X_val, + X_test, + y_train, + y_val, + y_test, + clusters_train, + clusters_val, + clusters_test, + ) = train_val_test_split( + self.X, + self.y, + sampler="kmeans", + random_state=42, + ) + all_output = [ + X_train, + X_val, + X_test, + y_train, + y_val, + y_test, + clusters_train, + clusters_val, + clusters_test, + ] + with open(self.reference_splits["kmeans-v1.2.2"], "rb") as f: + reference_output = pkl.load(f) + for i, j in zip(all_output, reference_output): + np.testing.assert_array_equal( + i, + j, + "Sampler kmeans failed regression testing.", + ) + if __name__ == "__main__": unittest.main() diff --git a/test/unit/samplers/extrapolative/test_kmeans.py b/test/unit/samplers/extrapolative/test_kmeans.py index 947b441e..8ed97ab3 100644 --- a/test/unit/samplers/extrapolative/test_kmeans.py +++ b/test/unit/samplers/extrapolative/test_kmeans.py @@ -1,11 +1,16 @@ import unittest import numpy as np +import pkg_resources from astartes import train_test_split from astartes.samplers import KMeans from astartes.utils.warnings import ImperfectSplittingWarning +SKLEARN_GEQ_13 = ( # get the sklearn version + int(pkg_resources.get_distribution("scikit-learn").version.split(".")[1]) >= 3 +) + class Test_kmeans(unittest.TestCase): """ @@ -35,7 +40,11 @@ def setUpClass(self): ] ) - def test_kmeans_sampling(self): + 
@unittest.skipIf( + SKLEARN_GEQ_13, + "sklearn version 1.3 or newer detected", + ) + def test_kmeans_sampling_v12(self): """Use kmeans in the train_test_split and verify results.""" with self.assertWarns(ImperfectSplittingWarning): ( @@ -117,6 +126,92 @@ def test_kmeans_sampling(self): "Test clusters incorrect.", ) + @unittest.skipUnless( + SKLEARN_GEQ_13, + "sklearn version less than 1.3 detected", + ) + def test_kmeans_sampling_v13(self): + """Use kmeans in the train_test_split and verify results.""" + with self.assertWarns(ImperfectSplittingWarning): + ( + X_train, + X_test, + y_train, + y_test, + labels_train, + labels_test, + clusters_train, + clusters_test, + ) = train_test_split( + self.X, + self.y, + labels=self.labels, + test_size=0.75, + train_size=0.25, + sampler="kmeans", + random_state=42, + hopts={ + "n_clusters": 2, + }, + ) + # test that the known arrays equal the result from above + self.assertIsNone( + np.testing.assert_array_equal( + X_train, + np.array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 0, 0, 0]]), + ), + "Train X incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + X_test, + np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 0]]), + ), + "Test X incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + y_train, + np.array([1, 2, 3]), + ), + "Train y incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + y_test, + np.array([4, 5]), + ), + "Test y incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + labels_train, + np.array(["one", "two", "three"]), + ), + "Train labels incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + labels_test, + np.array(["four", "five"]), + ), + "Test labels incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + clusters_train, + np.array([0, 0, 0]), + ), + "Train clusters incorrect.", + ) + self.assertIsNone( + np.testing.assert_array_equal( + clusters_test, + np.array([1, 1]), + ), + "Test clusters incorrect.", + 
) + def test_kmeans(self): """Directly instantiate and test KMeans.""" kmeans_instance = KMeans(