feat: tanh preprocessing (#139)
Signed-off-by: Avik Basu <[email protected]>
ab93 authored Mar 3, 2023
1 parent 460ba53 commit ec8401d
Showing 6 changed files with 106 additions and 22 deletions.
47 changes: 37 additions & 10 deletions docs/pre-processing.md
@@ -17,17 +17,22 @@ Now, with `add_factor`, each data point x is converted to log(x + add_factor)
Log transformation reduces the variance in some distributions, especially those with large outliers.

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from numalogic.preprocess.transformer import LogTransformer

# Generate some random train and test data
x_train = np.random.randn(100, 3)
x_test = np.random.randn(20, 3)

transformer = LogTransformer(add_factor=4)
scaler = MinMaxScaler()

pipeline = make_pipeline(transformer, scaler)

x_train_scaled = pipeline.fit_transform(x_train)
x_test_scaled = pipeline.transform(x_test)
```
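
LogTransformer also defines an `inverse_transform` (see `numalogic/preprocess/transformer.py` further down in this diff), so the log transformation can be undone later, for example after postprocessing. A minimal sketch of the round trip on illustrative data:

```python
import numpy as np
from numalogic.preprocess.transformer import LogTransformer

transformer = LogTransformer(add_factor=4)

# Illustrative non-negative data, so that log(x + add_factor) is well defined
x = np.abs(np.random.randn(10, 3))

x_log = transformer.transform(x)               # log(x + 4)
x_back = transformer.inverse_transform(x_log)  # exp(x_log) - 4

# The round trip recovers the original values up to floating-point error
assert np.allclose(x, x_back)
```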

### Static Power Transformer
@@ -37,15 +42,37 @@ Static Power Transformer converts each data point x to x<sup>n</sup>.
When `add_factor` is provided, each data point x is converted to (x + add_factor)<sup>n</sup>.

```python
import numpy as np
from numalogic.preprocess.transformer import StaticPowerTransformer

# Generate some random train and test data
x_train = np.random.randn(100, 3)
x_test = np.random.randn(20, 3)

transformer = StaticPowerTransformer(n=3, add_factor=2)

# Since this transformer is stateless, we can just call transform()
x_train_scaled = transformer.transform(x_train)
x_test_scaled = transformer.transform(x_test)
```
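
Although the updated example above drops the pipeline, a stateless transformer can still be composed with a stateful scaler via a scikit-learn pipeline, just like the LogTransformer example earlier. A sketch with the same kind of random data:

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from numalogic.preprocess.transformer import StaticPowerTransformer

x_train = np.random.randn(100, 3)
x_test = np.random.randn(20, 3)

# The power transform needs no fitting; MinMaxScaler learns its range from x_train
pipeline = make_pipeline(StaticPowerTransformer(n=3, add_factor=2), MinMaxScaler())

x_train_scaled = pipeline.fit_transform(x_train)
x_test_scaled = pipeline.transform(x_test)
```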

### Tanh Scaler

Tanh Scaler is a stateful estimator that applies tanh normalization to the Z-score
and scales the values between 0 and 1.
This scaler has been found to be more efficient, as well as more robust to the
effect of outliers in the data.

```python
import numpy as np
from numalogic.preprocess import TanhScaler

# Generate some random train and test data
x_train = np.random.randn(100, 3)
x_test = np.random.randn(20, 3)

scaler = TanhScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
```
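
One way to see the robustness claim in action (an illustrative sketch, not part of this commit): values far outside the training range saturate towards 1.0 under TanhScaler, whereas min-max scaling lets them overshoot the [0, 1] interval.

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from numalogic.preprocess import TanhScaler

rng = np.random.default_rng(42)
x_train = rng.normal(size=(100, 1))
x_test = np.array([[0.0], [5.0], [50.0]])  # the last two points lie far outside the training range

minmax = MinMaxScaler().fit(x_train)
tanh = TanhScaler().fit(x_train)

print(minmax.transform(x_test).ravel())  # out-of-range points map well beyond 1.0
print(tanh.transform(x_test).ravel())    # all values stay strictly within (0, 1)
```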
3 changes: 2 additions & 1 deletion numalogic/config/factory.py
@@ -25,7 +25,7 @@
)
from numalogic.models.threshold import StdDevThreshold, StaticThreshold
from numalogic.postprocess import TanhNorm
-from numalogic.preprocess import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler
from numalogic.tools.exceptions import UnknownConfigArgsError


@@ -54,6 +54,7 @@ class PreprocessFactory(_ObjectFactory):
"RobustScaler": RobustScaler,
"LogTransformer": LogTransformer,
"StaticPowerTransformer": StaticPowerTransformer,
"TanhScaler": TanhScaler,
}


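
For context, `PreprocessFactory` resolves these string keys from a config into transformer instances. The snippet below is only an illustrative sketch of that registry pattern; the names `_REGISTRY` and `build_preproc` are hypothetical and not part of the numalogic API.

```python
from sklearn.preprocessing import RobustScaler
from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler

# Hypothetical registry mirroring the keys visible in PreprocessFactory above
_REGISTRY = {
    "RobustScaler": RobustScaler,
    "LogTransformer": LogTransformer,
    "StaticPowerTransformer": StaticPowerTransformer,
    "TanhScaler": TanhScaler,
}


def build_preproc(name: str, **kwargs):
    """Look up a preprocessing class by its config name and instantiate it."""
    try:
        return _REGISTRY[name](**kwargs)
    except KeyError as err:
        raise ValueError(f"Unknown preprocess config: {name}") from err


scaler = build_preproc("TanhScaler", coeff=0.2)
```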
4 changes: 2 additions & 2 deletions numalogic/preprocess/__init__.py
@@ -9,6 +9,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer, TanhScaler

-__all__ = ["LogTransformer", "StaticPowerTransformer"]
+__all__ = ["LogTransformer", "StaticPowerTransformer", "TanhScaler"]
51 changes: 45 additions & 6 deletions numalogic/preprocess/transformer.py
@@ -13,7 +13,10 @@
import logging

import numpy as np
import numpy.typing as npt
from numpy.typing import ArrayLike
from sklearn.base import TransformerMixin, OneToOneFeatureMixin
from typing_extensions import Self

from numalogic.tools import DataIndependentTransformers

@@ -24,14 +27,14 @@ class LogTransformer(DataIndependentTransformers):
    def __init__(self, add_factor=2):
        self.add_factor = add_factor

-    def fit_transform(self, X, y=None, **fit_params):
-        return self.transform(X)
+    def fit_transform(self, x: npt.NDArray[float], y=None, **fit_params) -> npt.NDArray[float]:
+        return self.transform(x)

-    def transform(self, X):
-        return np.log(X + self.add_factor)
+    def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        return np.log(x + self.add_factor)

-    def inverse_transform(self, X) -> ArrayLike:
-        return np.exp(X) - self.add_factor
+    def inverse_transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
+        return np.exp(x) - self.add_factor


class StaticPowerTransformer(DataIndependentTransformers):
@@ -47,3 +50,39 @@ def transform(self, X):

    def inverse_transform(self, X) -> ArrayLike:
        return np.power(X, 1.0 / self.n) - self.add_factor


class TanhScaler(OneToOneFeatureMixin, TransformerMixin):
    r"""
    Tanh Estimator applies tanh normalization to the Z-score,
    and scales the values between 0 and 1.
    After scaling, the data has a mean of 0.5.
    The coeff parameter determines the spread of the scores:
    the higher the value, the steeper the slope of the linear portion of the curve,
    but the earlier it reaches the asymptote (flattens out).

    References:
        Nandakumar, Jain, Ross. 2005. Score Normalization in
        Multimodal Biometric Systems, Pattern Recognition 38, 2270-2285.
        https://web.cse.msu.edu/~rossarun/pubs/RossScoreNormalization_PR05.pdf
    """
    __slots__ = ("_coeff", "_std", "_mean")

    def __init__(self, coeff: float = 0.2):
        self._coeff = coeff
        self._std = None
        self._mean = None

    def fit(self, x: npt.NDArray[float]) -> Self:
        self._mean = np.mean(x, axis=0)
        self._std = np.std(x, axis=0)
        return self

    def transform(self, x: npt.NDArray[float]) -> npt.NDArray[float]:
        x_std_scaled = (x - self._mean) / self._std
        return 0.5 * (np.tanh(self._coeff * x_std_scaled) + 1)

    def fit_transform(self, x: npt.NDArray[float], y=None, **_) -> npt.NDArray[float]:
        return self.fit(x).transform(x)
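
A quick sanity check of the behaviour described in the docstring, namely that the scaled output is centred around 0.5 and bounded in (0, 1). This is illustrative only and not part of the commit.

```python
import numpy as np
from numalogic.preprocess import TanhScaler

rng = np.random.default_rng(0)
x = rng.normal(loc=10.0, scale=3.0, size=(1000, 2))

scaler = TanhScaler(coeff=0.2)
x_scaled = scaler.fit_transform(x)

print(x_scaled.mean(axis=0))                       # approximately [0.5, 0.5]
print(x_scaled.min() > 0.0, x_scaled.max() < 1.0)  # True True
```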
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "numalogic"
version = "0.3.3"
version = "0.3.4"
description = "Collection of operational Machine Learning models and tools."
authors = ["Numalogic Developers"]
packages = [{ include = "numalogic" }]
21 changes: 19 additions & 2 deletions tests/preprocess/test_transformer.py
@@ -1,9 +1,10 @@
import unittest

import numpy as np
-from numpy.testing import assert_almost_equal
+from numpy.testing import assert_almost_equal, assert_array_less
from sklearn.pipeline import make_pipeline

-from numalogic.preprocess.transformer import LogTransformer, StaticPowerTransformer
+from numalogic.preprocess import LogTransformer, StaticPowerTransformer, TanhScaler


class TestTransformers(unittest.TestCase):
@@ -25,6 +26,22 @@ def test_staticpowertransformer(self):
        assert_almost_equal(transformer.fit_transform(x), x_prime)
        assert_almost_equal(transformer.inverse_transform(x_prime), x, decimal=4)

    def test_tanh_scaler_1(self):
        x = 1 + np.random.randn(5, 3)
        scaler = TanhScaler()
        x_scaled = scaler.fit_transform(x)

        assert_array_less(x_scaled, np.ones_like(x_scaled))
        assert_array_less(np.zeros_like(x_scaled), x_scaled)

    def test_tanh_scaler_2(self):
        x = 1 + np.random.randn(5, 3)
        pl = make_pipeline(LogTransformer(), TanhScaler())

        x_scaled = pl.fit_transform(x)
        assert_array_less(x_scaled, np.ones_like(x_scaled))
        assert_array_less(np.zeros_like(x_scaled), x_scaled)


if __name__ == "__main__":
    unittest.main()
