diff --git a/docs/pre-processing.md b/docs/pre-processing.md
index cccfdb9a..21354132 100644
--- a/docs/pre-processing.md
+++ b/docs/pre-processing.md
@@ -17,17 +17,22 @@
 Now, with `add_factor`, each data point x is converted to log(x + add_factor).
 Log transformation reduces the variance in some distributions, especially in the presence of large outliers.
 
 ```python
+import numpy as np
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import MinMaxScaler
 
 from numalogic.preprocess.transformer import LogTransformer
-transformer = LogTransformer(add_factor=1)
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
+
+transformer = LogTransformer(add_factor=4)
 scaler = MinMaxScaler()
 
 pipeline = make_pipeline(transformer, scaler)
-X_train = transformer.transform(train_df.to_numpy())
-X_test = scaler.transform(test_df.to_numpy())
+x_train_scaled = pipeline.fit_transform(x_train)
+x_test_scaled = pipeline.transform(x_test)
 ```
 
 ### Static Power Transformer
@@ -37,15 +42,37 @@
 Static Power Transformer converts each data point x to x^n.
 
 When `add_factor` is provided, each data point x is converted to (x + add_factor)^n.
 
 ```python
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import MinMaxScaler
+import numpy as np
 from numalogic.preprocess.transformer import StaticPowerTransformer
-transformer = StaticPowerTransformer(n=3, add_factor=2)
-scaler = MinMaxScaler()
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
 
-pipeline = make_pipeline(transformer, scaler)
+transformer = StaticPowerTransformer(n=3, add_factor=2)
-X_train = transformer.transform(train_df.to_numpy())
-X_test = scaler.transform(test_df.to_numpy())
+# Since this transformer is stateless, we can just call transform()
+x_train_scaled = transformer.transform(x_train)
+x_test_scaled = transformer.transform(x_test)
 ```
+
+### Tanh Scaler
+
+Tanh Scaler is a stateful estimator that applies tanh normalization to the Z-score
+and scales the values between 0 and 1.
+This scaler tends to be more efficient, as well as more robust to the effect of outliers,
+than plain min-max scaling.
+Concretely, each feature is standardized to a Z-score using the mean and standard
+deviation learned during fit, and then squashed as
+`x_scaled = 0.5 * (tanh(coeff * (x - mean) / std) + 1)`.
+
+```python
+import numpy as np
+from numalogic.preprocess import TanhScaler
+
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
+
+scaler = TanhScaler()
+
+x_train_scaled = scaler.fit_transform(x_train)
+x_test_scaled = scaler.transform(x_test)
+```
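+
+Since `TanhScaler` follows the scikit-learn transformer API, it can also be chained
+with other transformers in a pipeline. Below is a minimal sketch, mirroring the
+pipeline used in the unit tests, where the data is log-transformed before being
+squashed into the (0, 1) range:
+
+```python
+import numpy as np
+from sklearn.pipeline import make_pipeline
+
+from numalogic.preprocess import LogTransformer, TanhScaler
+
+x = 1 + np.random.randn(5, 3)
+
+# LogTransformer is stateless; TanhScaler learns the mean and std during fit
+pipeline = make_pipeline(LogTransformer(), TanhScaler())
+x_scaled = pipeline.fit_transform(x)
+```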
\ No newline at end of file
diff --git a/numalogic/config/factory.py b/numalogic/config/factory.py
index 0004425c..d468f2a2 100644
--- a/numalogic/config/factory.py
+++ b/numalogic/config/factory.py
@@ -25,7 +25,7 @@
 )
 from numalogic.models.threshold import StdDevThreshold, StaticThreshold
 from numalogic.postprocess import TanhNorm
-from numalogic.preprocess import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
 from numalogic.tools.exceptions import UnknownConfigArgsError
 
 
@@ -54,6 +54,7 @@ class PreprocessFactory(_ObjectFactory):
         "RobustScaler": RobustScaler,
         "LogTransformer": LogTransformer,
         "StaticPowerTransformer": StaticPowerTransformer,
+        "TanhScaler": TanhScaler,
     }
 
 
diff --git a/numalogic/preprocess/__init__.py b/numalogic/preprocess/__init__.py
index 8eb1167d..5022141f 100644
--- a/numalogic/preprocess/__init__.py
+++ b/numalogic/preprocess/__init__.py
@@ -9,6 +9,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer, TanhScaler
 
-__all__ = ["LogTransformer", "StaticPowerTransformer"]
+__all__ = ["LogTransformer", "StaticPowerTransformer", "TanhScaler"]
diff --git a/numalogic/preprocess/transformer.py b/numalogic/preprocess/transformer.py
index 4ab270a1..c2e79256 100644
--- a/numalogic/preprocess/transformer.py
+++ b/numalogic/preprocess/transformer.py
@@ -13,7 +13,10 @@
 import logging
 
 import numpy as np
+import numpy.typing as npt
 from numpy.typing import ArrayLike
+from sklearn.base import TransformerMixin, OneToOneFeatureMixin
+from typing_extensions import Self
 
 from numalogic.tools import DataIndependentTransformers
 
@@ -24,14 +27,14 @@ class LogTransformer(DataIndependentTransformers):
     def __init__(self, add_factor=2):
         self.add_factor = add_factor
 
-    def fit_transform(self, X, y=None, **fit_params):
-        return self.transform(X)
+    def fit_transform(self, x: npt.NDArray[float], y=None, **fit_params) -> npt.NDArray[float]:
+        return self.transform(x)
 
-    def transform(self, X):
-        return np.log(X + self.add_factor)
+    def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        return np.log(x + self.add_factor)
 
-    def inverse_transform(self, X) -> ArrayLike:
-        return np.exp(X) - self.add_factor
+    def inverse_transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        return np.exp(x) - self.add_factor
 
 
 class StaticPowerTransformer(DataIndependentTransformers):
@@ -47,3 +50,39 @@ def transform(self, X):
 
     def inverse_transform(self, X) -> ArrayLike:
         return np.power(X, 1.0 / self.n) - self.add_factor
+
+
+class TanhScaler(OneToOneFeatureMixin, TransformerMixin):
+    r"""
+    Tanh Estimator applies tanh normalization to the Z-score
+    and scales the values between 0 and 1.
+
+    After scaling, the data has a mean of 0.5.
+
+    The coeff parameter determines the spread of the scores:
+    the higher the value, the steeper the slope of the linear portion of the curve,
+    but the earlier it reaches the asymptote (flattens out).
+
+    References:
+        Jain, Nandakumar, Ross. 2005. Score Normalization in
+        Multimodal Biometric Systems, Pattern Recognition 38, 2270-2285.
+        https://web.cse.msu.edu/~rossarun/pubs/RossScoreNormalization_PR05.pdf
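+
+    Example:
+        A minimal usage sketch (scaled values land strictly between 0 and 1):
+
+        >>> import numpy as np
+        >>> from numalogic.preprocess import TanhScaler
+        >>>
+        >>> x = np.random.randn(10, 3)
+        >>> scaler = TanhScaler(coeff=0.2)
+        >>> x_scaled = scaler.fit_transform(x)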
+    """
+
+    __slots__ = ("_coeff", "_std", "_mean")
+
+    def __init__(self, coeff: float = 0.2):
+        self._coeff = coeff
+        self._std = None
+        self._mean = None
+
+    def fit(self, x: npt.NDArray[float]) -> Self:
+        # Learn the per-feature mean and standard deviation
+        self._mean = np.mean(x, axis=0)
+        self._std = np.std(x, axis=0)
+        return self
+
+    def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        # Standardize to a Z-score, then squash into the (0, 1) range
+        x_std_scaled = (x - self._mean) / self._std
+        return 0.5 * (np.tanh(self._coeff * x_std_scaled) + 1)
+
+    def fit_transform(self, x: npt.NDArray[float], y=None, **_) -> npt.NDArray[float]:
+        return self.fit(x).transform(x)
diff --git a/pyproject.toml b/pyproject.toml
index 633a92f5..58b74170 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "numalogic"
-version = "0.3.3"
+version = "0.3.4"
 description = "Collection of operational Machine Learning models and tools."
 authors = ["Numalogic Developers"]
 packages = [{ include = "numalogic" }]
diff --git a/tests/preprocess/test_transformer.py b/tests/preprocess/test_transformer.py
index 95d4af9e..ac1fe094 100644
--- a/tests/preprocess/test_transformer.py
+++ b/tests/preprocess/test_transformer.py
@@ -1,9 +1,10 @@
 import unittest
 
 import numpy as np
-from numpy.testing import assert_almost_equal
+from numpy.testing import assert_almost_equal, assert_array_less
+from sklearn.pipeline import make_pipeline
 
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
 
 
 class TestTransformers(unittest.TestCase):
@@ -25,6 +26,22 @@ def test_staticpowertransformer(self):
         assert_almost_equal(transformer.fit_transform(x), x_prime)
         assert_almost_equal(transformer.inverse_transform(x_prime), x, decimal=4)
 
+    def test_tanh_scaler_1(self):
+        x = 1 + np.random.randn(5, 3)
+        scaler = TanhScaler()
+        x_scaled = scaler.fit_transform(x)
+
+        assert_array_less(x_scaled, np.ones_like(x_scaled))
+        assert_array_less(np.zeros_like(x_scaled), x_scaled)
+
+    def test_tanh_scaler_2(self):
+        x = 1 + np.random.randn(5, 3)
+        pl = make_pipeline(LogTransformer(), TanhScaler())
+
+        x_scaled = pl.fit_transform(x)
+        assert_array_less(x_scaled, np.ones_like(x_scaled))
+        assert_array_less(np.zeros_like(x_scaled), x_scaled)
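+
+    def test_tanh_scaler_mean(self):
+        # Sanity-check the docstring's claim that the scaled data centres on 0.5;
+        # the mean is only approximately 0.5, so the tolerance is kept loose
+        x = np.random.randn(10000, 3)
+        scaler = TanhScaler()
+        x_scaled = scaler.fit_transform(x)
+        assert_almost_equal(np.mean(x_scaled), 0.5, decimal=2)
+
 
 if __name__ == "__main__":
     unittest.main()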