From ec8401d3bbcb6a435af58101eccbfc489c37a4f2 Mon Sep 17 00:00:00 2001
From: Avik Basu <3485425+ab93@users.noreply.github.com>
Date: Fri, 3 Mar 2023 15:47:20 -0800
Subject: [PATCH] feat: tanh preprocessing (#139)

Signed-off-by: Avik Basu <ab93@users.noreply.github.com>
---
 docs/pre-processing.md               | 47 +++++++++++++++++++------
 numalogic/config/factory.py          |  3 +-
 numalogic/preprocess/__init__.py     |  4 +--
 numalogic/preprocess/transformer.py  | 51 ++++++++++++++++++++++++----
 pyproject.toml                       |  2 +-
 tests/preprocess/test_transformer.py | 21 ++++++++++--
 6 files changed, 106 insertions(+), 22 deletions(-)
diff --git a/docs/pre-processing.md b/docs/pre-processing.md
index cccfdb9a..21354132 100644
--- a/docs/pre-processing.md
+++ b/docs/pre-processing.md
@@ -17,17 +17,22 @@ Now, with `add_factor`, each data point x is converted to log(x + add_factor)
 Log transformation reduces the variance in some distributions, especially with large outliers.
 
 ```python
+import numpy as np
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import MinMaxScaler
 from numalogic.preprocess.transformer import LogTransformer
 
-transformer = LogTransformer(add_factor=1)
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
+
+transformer = LogTransformer(add_factor=4)
 scaler = MinMaxScaler()
 
 pipeline = make_pipeline(transformer, scaler)
 
-X_train = transformer.transform(train_df.to_numpy())
-X_test = scaler.transform(test_df.to_numpy())
+x_train_scaled = pipeline.fit_transform(x_train)
+x_test_scaled = pipeline.transform(x_test)
 ```
 
 ### Static Power Transformer
@@ -37,15 +42,37 @@ Static Power Transformer converts each data point x to x<sup>n</sup>.
 When `add_factor` is provided, each data point x is converted to (x + add_factor)<sup>n</sup>
 
 ```python
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import MinMaxScaler
+import numpy as np
 from numalogic.preprocess.transformer import StaticPowerTransformer
 
-transformer = StaticPowerTransformer(n=3, add_factor=2)
-scaler = MinMaxScaler()
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
 
-pipeline = make_pipeline(transformer, scaler)
+transformer = StaticPowerTransformer(n=3, add_factor=2)
 
-X_train = transformer.transform(train_df.to_numpy())
-X_test = scaler.transform(test_df.to_numpy())
+# Since this transformer is stateless, we can just call transform()
+x_train_scaled = transformer.transform(x_train)
+x_test_scaled = transformer.transform(x_test)
 ```
+
+### Tanh Scaler
+
+Tanh Scaler is a stateful estimator that applies tanh normalization to the Z-score,
+and scales the values between 0 and 1.
+This scaler has been shown to be both efficient and robust to the effect of outliers
+in the data.
+
+```python
+import numpy as np
+from numalogic.preprocess import TanhScaler
+
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
+
+scaler = TanhScaler()
+
+x_train_scaled = scaler.fit_transform(x_train)
+x_test_scaled = scaler.transform(x_test)
+```
\ No newline at end of file
diff --git a/numalogic/config/factory.py b/numalogic/config/factory.py
index 0004425c..d468f2a2 100644
--- a/numalogic/config/factory.py
+++ b/numalogic/config/factory.py
@@ -25,7 +25,7 @@
 )
 from numalogic.models.threshold import StdDevThreshold, StaticThreshold
 from numalogic.postprocess import TanhNorm
-from numalogic.preprocess import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
 from numalogic.tools.exceptions import UnknownConfigArgsError
 
 
@@ -54,6 +54,7 @@ class PreprocessFactory(_ObjectFactory):
         "RobustScaler": RobustScaler,
         "LogTransformer": LogTransformer,
         "StaticPowerTransformer": StaticPowerTransformer,
+        "TanhScaler": TanhScaler,
     }
 
 
diff --git a/numalogic/preprocess/__init__.py b/numalogic/preprocess/__init__.py
index 8eb1167d..5022141f 100644
--- a/numalogic/preprocess/__init__.py
+++ b/numalogic/preprocess/__init__.py
@@ -9,6 +9,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer, TanhScaler
 
-__all__ = ["LogTransformer", "StaticPowerTransformer"]
+__all__ = ["LogTransformer", "StaticPowerTransformer", "TanhScaler"]
diff --git a/numalogic/preprocess/transformer.py b/numalogic/preprocess/transformer.py
index 4ab270a1..c2e79256 100644
--- a/numalogic/preprocess/transformer.py
+++ b/numalogic/preprocess/transformer.py
@@ -13,7 +13,10 @@
 import logging
 
 import numpy as np
+import numpy.typing as npt
 from numpy.typing import ArrayLike
+from sklearn.base import TransformerMixin, OneToOneFeatureMixin
+from typing_extensions import Self
 
 from numalogic.tools import DataIndependentTransformers
 
@@ -24,14 +27,14 @@ class LogTransformer(DataIndependentTransformers):
     def __init__(self, add_factor=2):
         self.add_factor = add_factor
 
-    def fit_transform(self, X, y=None, **fit_params):
-        return self.transform(X)
+    def fit_transform(self, x: npt.NDArray[float], y=None, **fit_params) -> npt.NDArray[float]:
+        return self.transform(x)
 
-    def transform(self, X):
-        return np.log(X + self.add_factor)
+    def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        return np.log(x + self.add_factor)
 
-    def inverse_transform(self, X) -> ArrayLike:
-        return np.exp(X) - self.add_factor
+    def inverse_transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        return np.exp(x) - self.add_factor
 
 
 class StaticPowerTransformer(DataIndependentTransformers):
@@ -47,3 +50,39 @@ def transform(self, X):
 
     def inverse_transform(self, X) -> ArrayLike:
         return np.power(X, 1.0 / self.n) - self.add_factor
+
+
+class TanhScaler(OneToOneFeatureMixin, TransformerMixin):
+    r"""
+    Tanh Estimator applies tanh normalization to the Z-score,
+    and scales the values between 0 and 1.
+
+    After scaling, the training mean maps to 0.5, so the data mean is close to 0.5.
+
+    The coeff parameter determines the spread of the scores: the higher the value, the steeper
+    the linear portion of the curve, but the earlier it reaches the asymptote (flattens out).
+
+    References:
+        Nandakumar, Jain, Ross. 2005. Score Normalization in
+        Multimodal Biometric Systems, Pattern Recognition 38, 2270-2285.
+        https://web.cse.msu.edu/~rossarun/pubs/RossScoreNormalization_PR05.pdf
+    """
+    __slots__ = ("_coeff", "_std", "_mean")
+
+    def __init__(self, coeff: float = 0.2):
+        self._coeff = coeff
+        self._std = None  # per-feature standard deviation, set by fit()
+        self._mean = None  # per-feature mean, set by fit()
+
+    def fit(self, x: npt.NDArray[float], y=None, **_) -> Self:  # y unused; kept for Pipeline.fit
+        self._mean = np.mean(x, axis=0)
+        self._std = np.std(x, axis=0)
+        return self
+
+    def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        # Constant (zero-std) features fall back to a unit scale, avoiding 0/0 = NaN.
+        x_std_scaled = (x - self._mean) / np.where(self._std == 0.0, 1.0, self._std)
+        return 0.5 * (np.tanh(self._coeff * x_std_scaled) + 1)
+
+    def fit_transform(self, x: npt.NDArray[float], y=None, **_) -> npt.NDArray[float]:
+        return self.fit(x).transform(x)
diff --git a/pyproject.toml b/pyproject.toml
index 633a92f5..58b74170 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "numalogic"
-version = "0.3.3"
+version = "0.3.4"
 description = "Collection of operational Machine Learning models and tools."
 authors = ["Numalogic Developers"]
 packages = [{ include = "numalogic" }]
diff --git a/tests/preprocess/test_transformer.py b/tests/preprocess/test_transformer.py
index 95d4af9e..ac1fe094 100644
--- a/tests/preprocess/test_transformer.py
+++ b/tests/preprocess/test_transformer.py
@@ -1,9 +1,10 @@
 import unittest
 
 import numpy as np
-from numpy.testing import assert_almost_equal
+from numpy.testing import assert_almost_equal, assert_array_less
+from sklearn.pipeline import make_pipeline
 
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
 
 
 class TestTransformers(unittest.TestCase):
@@ -25,6 +26,22 @@ def test_staticpowertransformer(self):
         assert_almost_equal(transformer.fit_transform(x), x_prime)
         assert_almost_equal(transformer.inverse_transform(x_prime), x, decimal=4)
 
+    def test_tanh_scaler_1(self):
+        """TanhScaler.fit_transform output lies strictly between 0 and 1."""
+        x = 1 + np.random.randn(5, 3)
+        scaler = TanhScaler()
+        x_scaled = scaler.fit_transform(x)
+        assert_array_less(x_scaled, np.ones_like(x_scaled))
+        assert_array_less(np.zeros_like(x_scaled), x_scaled)
+
+    def test_tanh_scaler_2(self):
+        """Pipeline of LogTransformer and TanhScaler yields values strictly inside (0, 1)."""
+        x = 1 + np.abs(np.random.randn(5, 3))  # keep x + add_factor > 0 so log() stays finite
+        pl = make_pipeline(LogTransformer(), TanhScaler())
+        x_scaled = pl.fit_transform(x)
+        assert_array_less(x_scaled, np.ones_like(x_scaled))
+        assert_array_less(np.zeros_like(x_scaled), x_scaled)
+
 
 if __name__ == "__main__":
     unittest.main()