diff --git a/docs/pre-processing.md b/docs/pre-processing.md
index cccfdb9a..21354132 100644
--- a/docs/pre-processing.md
+++ b/docs/pre-processing.md
@@ -17,17 +17,22 @@ Now, with `add_factor`, each data point x is converted to log(x + add_factor)
Log transformation reduces the variance of some distributions, especially those with large outliers.
```python
+import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from numalogic.preprocess.transformer import LogTransformer
-transformer = LogTransformer(add_factor=1)
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
+
+transformer = LogTransformer(add_factor=4)
scaler = MinMaxScaler()
pipeline = make_pipeline(transformer, scaler)
-X_train = transformer.transform(train_df.to_numpy())
-X_test = scaler.transform(test_df.to_numpy())
+x_train_scaled = pipeline.fit_transform(x_train)
+x_test_scaled = pipeline.transform(x_test)
```
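+
+`LogTransformer` also provides an `inverse_transform` (implemented as
+`exp(x) - add_factor`), so transformed values can be mapped back to the
+original scale:
+
+```python
+x_recovered = transformer.inverse_transform(transformer.transform(x_train))  # ~= x_train
+```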
### Static Power Transformer
@@ -37,15 +42,37 @@ Static Power Transformer converts each data point x to x^n.
When `add_factor` is provided, each data point x is converted to (x + add_factor)^n
```python
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import MinMaxScaler
+import numpy as np
from numalogic.preprocess.transformer import StaticPowerTransformer
-transformer = StaticPowerTransformer(n=3, add_factor=2)
-scaler = MinMaxScaler()
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
-pipeline = make_pipeline(transformer, scaler)
+transformer = StaticPowerTransformer(n=3, add_factor=2)
-X_train = transformer.transform(train_df.to_numpy())
-X_test = scaler.transform(test_df.to_numpy())
+# Since this transformer is stateless, we can just call transform()
+x_train_scaled = transformer.transform(x_train)
+x_test_scaled = transformer.transform(x_test)
```
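+
+`StaticPowerTransformer` also exposes `inverse_transform`, implemented as
+`x ** (1/n) - add_factor`, which approximately recovers the original values
+(for inputs where `x + add_factor` is non-negative):
+
+```python
+x_recovered = transformer.inverse_transform(x_train_scaled)
+```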
+
+### Tanh Scaler
+
+Tanh Scaler is a stateful estimator that applies tanh normalization to the Z-score
+and scales the values between 0 and 1.
+This scaler is both efficient and robust to the effect of outliers
+in the data.
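+
+Concretely, each value is scaled as `0.5 * (tanh(coeff * z) + 1)`, where `z` is
+the feature-wise Z-score computed from the statistics learned during `fit`.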
+
+```python
+import numpy as np
+from numalogic.preprocess import TanhScaler
+
+# Generate some random train and test data
+x_train = np.random.randn(100, 3)
+x_test = np.random.randn(20, 3)
+
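+# the optional coeff argument (default 0.2) determines the spread of the scores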
+scaler = TanhScaler()
+
+x_train_scaled = scaler.fit_transform(x_train)
+x_test_scaled = scaler.transform(x_test)
+```
\ No newline at end of file
diff --git a/numalogic/config/factory.py b/numalogic/config/factory.py
index 0004425c..d468f2a2 100644
--- a/numalogic/config/factory.py
+++ b/numalogic/config/factory.py
@@ -25,7 +25,7 @@
)
from numalogic.models.threshold import StdDevThreshold, StaticThreshold
from numalogic.postprocess import TanhNorm
-from numalogic.preprocess import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
from numalogic.tools.exceptions import UnknownConfigArgsError
@@ -54,6 +54,7 @@ class PreprocessFactory(_ObjectFactory):
"RobustScaler": RobustScaler,
"LogTransformer": LogTransformer,
"StaticPowerTransformer": StaticPowerTransformer,
+ "TanhScaler": TanhScaler,
}
diff --git a/numalogic/preprocess/__init__.py b/numalogic/preprocess/__init__.py
index 8eb1167d..5022141f 100644
--- a/numalogic/preprocess/__init__.py
+++ b/numalogic/preprocess/__init__.py
@@ -9,6 +9,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer, TanhScaler
-__all__ = ["LogTransformer", "StaticPowerTransformer"]
+__all__ = ["LogTransformer", "StaticPowerTransformer", "TanhScaler"]
diff --git a/numalogic/preprocess/transformer.py b/numalogic/preprocess/transformer.py
index 4ab270a1..c2e79256 100644
--- a/numalogic/preprocess/transformer.py
+++ b/numalogic/preprocess/transformer.py
@@ -13,7 +13,10 @@
import logging
import numpy as np
+import numpy.typing as npt
from numpy.typing import ArrayLike
+from sklearn.base import TransformerMixin, OneToOneFeatureMixin
+from typing_extensions import Self
from numalogic.tools import DataIndependentTransformers
@@ -24,14 +27,14 @@ class LogTransformer(DataIndependentTransformers):
def __init__(self, add_factor=2):
self.add_factor = add_factor
- def fit_transform(self, X, y=None, **fit_params):
- return self.transform(X)
+ def fit_transform(self, x: npt.NDArray[float], y=None, **fit_params) -> npt.NDArray[float]:
+ return self.transform(x)
- def transform(self, X):
- return np.log(X + self.add_factor)
+ def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+ return np.log(x + self.add_factor)
- def inverse_transform(self, X) -> ArrayLike:
- return np.exp(X) - self.add_factor
+ def inverse_transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+ return np.exp(x) - self.add_factor
class StaticPowerTransformer(DataIndependentTransformers):
@@ -47,3 +50,39 @@ def transform(self, X):
def inverse_transform(self, X) -> ArrayLike:
return np.power(X, 1.0 / self.n) - self.add_factor
+
+
+class TanhScaler(OneToOneFeatureMixin, TransformerMixin):
+ r"""
+ Tanh Estimator applies tanh normalization to the Z-score,
+ and scales the values between 0 and 1.
+
+    After scaling, the mean of each feature in the training data maps to 0.5.
+
+    The coeff parameter determines the spread of the scores:
+    the higher the value, the steeper the slope of the linear portion
+    of the curve, and the earlier it reaches the asymptote (flattens out).
+
+ References:
+        Jain, Nandakumar, Ross. 2005. Score Normalization in
+        Multimodal Biometric Systems, Pattern Recognition 38, 2270-2285.
+ https://web.cse.msu.edu/~rossarun/pubs/RossScoreNormalization_PR05.pdf
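+
+    Args:
+        coeff: scaling coefficient that determines the spread of the scores
+            (defaults to 0.2)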
+ """
+ __slots__ = ("_coeff", "_std", "_mean")
+
+ def __init__(self, coeff: float = 0.2):
+ self._coeff = coeff
+ self._std = None
+ self._mean = None
+
+    def fit(self, x: npt.NDArray[float], y=None) -> Self:
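+        # learn the per-feature mean and standard deviation from the training data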
+ self._mean = np.mean(x, axis=0)
+ self._std = np.std(x, axis=0)
+ return self
+
+ def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
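+        # standardize using the statistics learned in fit, then squash with tanh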
+ x_std_scaled = (x - self._mean) / self._std
+ return 0.5 * (np.tanh(self._coeff * x_std_scaled) + 1)
+
+ def fit_transform(self, x: npt.NDArray[float], y=None, **_) -> npt.NDArray[float]:
+ return self.fit(x).transform(x)
diff --git a/pyproject.toml b/pyproject.toml
index 633a92f5..58b74170 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "numalogic"
-version = "0.3.3"
+version = "0.3.4"
description = "Collection of operational Machine Learning models and tools."
authors = ["Numalogic Developers"]
packages = [{ include = "numalogic" }]
diff --git a/tests/preprocess/test_transformer.py b/tests/preprocess/test_transformer.py
index 95d4af9e..ac1fe094 100644
--- a/tests/preprocess/test_transformer.py
+++ b/tests/preprocess/test_transformer.py
@@ -1,9 +1,10 @@
import unittest
import numpy as np
-from numpy.testing import assert_almost_equal
+from numpy.testing import assert_almost_equal, assert_array_less
+from sklearn.pipeline import make_pipeline
-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
class TestTransformers(unittest.TestCase):
@@ -25,6 +26,22 @@ def test_staticpowertransformer(self):
assert_almost_equal(transformer.fit_transform(x), x_prime)
assert_almost_equal(transformer.inverse_transform(x_prime), x, decimal=4)
+ def test_tanh_scaler_1(self):
+ x = 1 + np.random.randn(5, 3)
+ scaler = TanhScaler()
+ x_scaled = scaler.fit_transform(x)
+
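+        # tanh maps to (-1, 1), so the scaled values lie strictly in (0, 1)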
+ assert_array_less(x_scaled, np.ones_like(x_scaled))
+ assert_array_less(np.zeros_like(x_scaled), x_scaled)
+
+ def test_tanh_scaler_2(self):
+ x = 1 + np.random.randn(5, 3)
+ pl = make_pipeline(LogTransformer(), TanhScaler())
+
+ x_scaled = pl.fit_transform(x)
+ assert_array_less(x_scaled, np.ones_like(x_scaled))
+ assert_array_less(np.zeros_like(x_scaled), x_scaled)
+
if __name__ == "__main__":
unittest.main()