diff --git a/.safety-policy.yml b/.safety-policy.yml index ab621b7a..95048fe6 100644 --- a/.safety-policy.yml +++ b/.safety-policy.yml @@ -9,4 +9,6 @@ security: ignore-vulnerabilities: 66947: reason: Not used + 70612: + reason: Only used during documentation generation continue-on-vulnerability-error: False diff --git a/mleko/dataset/transform/label_encoder_transformer.py b/mleko/dataset/transform/label_encoder_transformer.py index d5fa6117..d2f85319 100644 --- a/mleko/dataset/transform/label_encoder_transformer.py +++ b/mleko/dataset/transform/label_encoder_transformer.py @@ -145,7 +145,12 @@ def _fit( logger.info(f"Fitting label encoder transformer ({len(self._features)}): {self._features}.") for feature in self._features: self._ensure_valid_feature_type(feature, data_schema, dataframe) - labels: list[str] = get_column(dataframe, feature).unique(dropna=True) # type: ignore + labels: list[str] = [ + label + for label in get_column(dataframe, feature).to_arrow().unique().to_pylist() # type: ignore + if label is not None + ] + if not self._fit_using_label_dict(feature, labels): logger.info(f"Assigning mappings for feature {feature!r}: {labels}.") self._transformer[feature] = {label: i for i, label in enumerate(labels)} diff --git a/mleko/model/lgbm_model.py b/mleko/model/lgbm_model.py index e7b1d7e4..174403ac 100644 --- a/mleko/model/lgbm_model.py +++ b/mleko/model/lgbm_model.py @@ -12,6 +12,7 @@ import pandas as pd import vaex from lightgbm.sklearn import _LGBM_ScikitEvalMetricType +from sklearn.utils.validation import NotFittedError, check_is_fitted from mleko.dataset.data_schema import DataSchema from mleko.utils.custom_logger import CustomLogger @@ -246,7 +247,18 @@ def _fingerprint(self) -> Hashable: Returns: The fingerprint of the model. """ - return (super()._fingerprint(), self._target, self._model.__class__.__qualname__) + is_fitted = True + try: + check_is_fitted(self._model) # type: ignore + except NotFittedError: + is_fitted = False + + return ( + super()._fingerprint(), + self._target, + self._model.__class__.__qualname__, + self._model.booster_.model_to_string() if is_fitted else None, + ) def _default_features(self, data_schema: DataSchema) -> tuple[str, ...]: """The default set of features to use for training. diff --git a/mleko/model/tune/optuna_tuner.py b/mleko/model/tune/optuna_tuner.py index 2c9120ab..bd149fe1 100644 --- a/mleko/model/tune/optuna_tuner.py +++ b/mleko/model/tune/optuna_tuner.py @@ -339,6 +339,7 @@ def _fingerprint(self) -> Hashable: CallableSourceFingerprinter().fingerprint(self._objective_function), self._direction, self._num_trials, + JsonFingerprinter().fingerprint(self._enqueue_trials) if self._enqueue_trials is not None else None, CallableSourceFingerprinter().fingerprint(self._cv_folds) if self._cv_folds is not None else None, OptunaSamplerFingerprinter().fingerprint(self._sampler), OptunaPrunerFingerprinter().fingerprint(self._pruner), diff --git a/poetry.lock b/poetry.lock index 6cab7fba..e69282cd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "alabaster" @@ -4050,7 +4050,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -4058,16 +4057,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -4084,7 +4075,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -4092,7 +4082,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4832,7 +4821,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""} typing-extensions = ">=4.6.0" [package.extras] diff --git a/tests/model/test_lgbm_model.py b/tests/model/test_lgbm_model.py index 56cf736f..de2f537f 100644 --- a/tests/model/test_lgbm_model.py +++ b/tests/model/test_lgbm_model.py @@ -119,6 +119,24 @@ def test_cache_fit_transform_no_validation( ).fit_transform(example_data_schema, example_vaex_dataframe_train, None, {}) mocked_fit_transform.assert_not_called() + def test_chache_fit_transform_with_refit( + self, + temporary_directory: Path, + example_data_schema: DataSchema, + example_vaex_dataframe_train: vaex.DataFrame, + ): + """Should train the model using fit_transform and use the cache once called again with refit.""" + lgbm_model = LGBMModel( + cache_directory=temporary_directory, target="target", model=lgb.LGBMClassifier(objective="binary") + ) + lgbm_model.fit(example_data_schema, example_vaex_dataframe_train.copy()) + lgbm_model.transform(example_data_schema, example_vaex_dataframe_train.copy()) + lgbm_model.fit(example_data_schema, example_vaex_dataframe_train.copy(), hyperparameters={"num_leaves": 10}) + + with patch.object(LGBMModel, "_transform") as mocked_transform: + lgbm_model.transform(example_data_schema, example_vaex_dataframe_train.copy()) + mocked_transform.assert_called() + def test_cache_fit_and_transform( self, temporary_directory: Path,