Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Categorical reverse transform may crash with ValueError for certain dtypes (int64) #755

Merged
merged 11 commits into from
Jan 22, 2024
22 changes: 18 additions & 4 deletions rdt/transformers/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from rdt.errors import TransformerInputError
from rdt.transformers.base import BaseTransformer
from rdt.transformers.utils import fill_nan_with_none
from rdt.transformers.utils import check_nan_in_transform, fill_nan_with_none, try_convert_to_dtype

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -177,6 +177,7 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
data = data.clip(0, 1)
bins = [0]
labels = []
Expand All @@ -192,7 +193,10 @@ def _reverse_transform(self, data):
labels.append(key)

result = pd.cut(data, bins=bins, labels=labels, include_lowest=True)
return result.replace(nan_name, np.nan).astype(self.dtype)
result = result.replace(nan_name, np.nan)
result = try_convert_to_dtype(result, self.dtype)

return result


class OrderedUniformEncoder(UniformEncoder):
Expand Down Expand Up @@ -333,6 +337,7 @@ def __init__(self, add_noise=False):
)
super().__init__()
self.add_noise = add_noise
self._is_integer = None

@staticmethod
def _get_intervals(data):
Expand Down Expand Up @@ -516,6 +521,7 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
data = data.clip(0, 1)
num_rows = len(data)
num_categories = len(self.means)
Expand Down Expand Up @@ -545,6 +551,7 @@ class OneHotEncoder(BaseTransformer):
_dummy_encoded = False
_indexer = None
_uniques = None
dtype = None

@staticmethod
def _prepare_data(data):
Expand Down Expand Up @@ -582,6 +589,7 @@ def _fit(self, data):
data (pandas.Series or pandas.DataFrame):
Data to fit the transformer to.
"""
self.dtype = data.dtype
data = self._prepare_data(data)

null = pd.isna(data).to_numpy()
Expand Down Expand Up @@ -657,15 +665,18 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if data.ndim == 1:
data = data.reshape(-1, 1)

indices = np.argmax(data, axis=1)
result = pd.Series(indices).map(self.dummies.__getitem__)
result = try_convert_to_dtype(result, self.dtype)

return pd.Series(indices).map(self.dummies.__getitem__)
return result


class LabelEncoder(BaseTransformer):
Expand Down Expand Up @@ -801,13 +812,15 @@ def _reverse_transform(self, data):
Returns:
pandas.Series
"""
check_nan_in_transform(data, self.dtype)
if self.add_noise:
data = np.floor(data)

data = data.clip(min(self.values_to_categories), max(self.values_to_categories))
data = data.round().map(self.values_to_categories)
data = try_convert_to_dtype(data, self.dtype)

return data.astype(self.dtype)
return data


class OrderedLabelEncoder(LabelEncoder):
Expand Down Expand Up @@ -865,6 +878,7 @@ def _fit(self, data):
data (pandas.Series):
Data to fit the transformer to.
"""
self.dtype = data.dtype
data = data.fillna(np.nan)
missing = list(data[~data.isin(self.order)].unique())
if len(missing) > 0:
Expand Down
50 changes: 50 additions & 0 deletions rdt/transformers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import re
import string
import warnings

import numpy as np
import pandas as pd

import sre_parse # isort:skip

Expand Down Expand Up @@ -184,3 +186,51 @@ def flatten_column_list(column_list):
flattened.append(column)

return flattened


def check_nan_in_transform(data, dtype):
    """Warn if the transformed data contains null values.

    Nothing is returned and the data is not modified; the only effect is a
    warning emitted through ``warnings.warn`` when at least one null is found.

    Args:
        data (pd.Series, pd.DataFrame or numpy.ndarray):
            Data that has been transformed.
        dtype (str or numpy.dtype):
            Data type that the reverse transformed data is expected to have.
            It is only used to decide the wording of the warning.
    """
    # ``np.asarray`` flattens the differences between the Series, DataFrame
    # and ndarray results of ``pd.isna`` so a single ``any`` covers them all.
    if not np.asarray(pd.isna(data)).any():
        return

    message = (
        'There are null values in the transformed data. The reversed '
        'transformed data will contain null values'
    )
    # For integer dtypes the message tells the user the nulls come back as float.
    if pd.api.types.is_integer_dtype(dtype):
        message += " of type 'float'."
    else:
        message += '.'

    warnings.warn(message)


def try_convert_to_dtype(data, dtype):
    """Try to convert data to a given dtype, falling back to float for integers.

    Args:
        data (pd.Series or numpy.ndarray):
            Data to convert.
        dtype (str or numpy.dtype):
            Data type to convert to.

    Returns:
        pd.Series or numpy.ndarray:
            Data converted to ``dtype``. If that conversion raises a
            ``ValueError`` and ``dtype`` is an integer dtype, the data is
            converted to ``float`` instead.

    Raises:
        ValueError:
            If the conversion fails and ``dtype`` is not an integer dtype.
    """
    try:
        return data.astype(dtype)
    except ValueError:
        # Only integer targets get the float fallback (e.g. NaN cannot be cast
        # to an integer dtype); any other failure is re-raised unchanged.
        if not pd.api.types.is_integer_dtype(dtype):
            raise

        return data.astype(float)
83 changes: 78 additions & 5 deletions tests/unit/transformers/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,9 @@ def test__transform_user_warning(self):
assert transformed.iloc[4] >= 0
assert transformed.iloc[4] < 1

def test__reverse_transform(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
@patch('rdt.transformers.categorical.try_convert_to_dtype')
def test__reverse_transform(self, mock_convert_dtype, mock_check_nan):
"""Test the ``_reverse_transform``."""
# Setup
data = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2])
Expand All @@ -289,12 +291,18 @@ def test__reverse_transform(self):
}

transformed = pd.Series([0.12, 0.254, 0.789, 0.43, 0.56, 0.08, 0.67, 0.98, 0.36])
mock_convert_dtype.return_value = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2])

# Run
output = transformer._reverse_transform(transformed)

# Asserts
pd.testing.assert_series_equal(output, data)
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, transformed)
assert mock_input_dtype == transformer.dtype
mock_convert_dtype.assert_called_once()

def test__reverse_transform_nans(self):
"""Test ``_reverse_transform`` for data with NaNs."""
Expand Down Expand Up @@ -323,6 +331,25 @@ def test__reverse_transform_nans(self):
# Asserts
pd.testing.assert_series_equal(output, data)

def test__reverse_transform_integer_and_nans(self):
"""Test the ``_reverse_transform`` method with integers and nans.

Test that the method correctly reverse transforms the data
when the initial data is integers and the transformed data has nans.
The nan in the input is expected to come back as nan, with the
remaining values mapped back to their integer categories.
"""
# Setup
# The fitted state is assigned by hand here instead of calling ``fit``.
transformer = UniformEncoder()
transformer.frequencies = {11: 0.2, 12: 0.3, 13: 0.5}
transformer.intervals = {11: [0, 0.2], 12: [0.2, 0.5], 13: [0.5, 1]}
transformer.dtype = np.int64
data = pd.Series([0.1, 0.25, np.nan, 0.65])

# Run
out = transformer._reverse_transform(data)

# Assert
# The expected Series holds a nan, so pandas builds it as float, not int64.
pd.testing.assert_series_equal(out, pd.Series([11, 12, np.nan, 13]))


@pytest.fixture(autouse=True)
def _setup_caplog(caplog):
Expand Down Expand Up @@ -718,7 +745,8 @@ def test__get_value_add_noise_true(self, norm_mock):
# Asserts
assert result == 0.2745

def test__reverse_transform_series(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
def test__reverse_transform_series(self, mock_check_nan):
"""Test reverse_transform a pandas Series"""
# Setup
data = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar'])
Expand All @@ -730,6 +758,10 @@ def test__reverse_transform_series(self):
result = transformer._reverse_transform(rt_data)

# Asserts
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, rt_data)
assert mock_input_dtype == transformer.dtype
expected_intervals = {
'foo': (
0,
Expand Down Expand Up @@ -1123,7 +1155,8 @@ def test__reverse_transform_by_row_called(self):
np.testing.assert_array_equal(reverse_arg, data.clip(0, 1))
assert reverse == categorical_transformer_mock._reverse_transform_by_row.return_value

def test__reverse_transform_by_row(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
def test__reverse_transform_by_row(self, mock_check_nan):
"""Test the _reverse_transform_by_row method with numerical data.

Expect that the transformed data is correctly reverse transformed.
Expand Down Expand Up @@ -1156,6 +1189,10 @@ def test__reverse_transform_by_row(self):
reverse = transformer._reverse_transform(transformed)

# Assert
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, transformed)
assert mock_input_dtype == data.dtype
pd.testing.assert_series_equal(data, reverse)


Expand Down Expand Up @@ -1222,6 +1259,7 @@ def test__fit_dummies_no_nans(self):

# Assert
np.testing.assert_array_equal(ohe.dummies, ['a', 2, 'c'])
assert ohe.dtype == 'object'

def test__fit_dummies_nans(self):
"""Test the ``_fit`` method without nans.
Expand Down Expand Up @@ -1776,11 +1814,14 @@ def test__transform_numeric(self):
assert not ohe._dummy_encoded
np.testing.assert_array_equal(out, expected)

def test__reverse_transform_no_nans(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
@patch('rdt.transformers.categorical.try_convert_to_dtype')
def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan):
# Setup
ohe = OneHotEncoder()
data = pd.Series(['a', 'b', 'c'])
ohe._fit(data)
mock_convert_dtype.return_value = data

# Run
transformed = np.array([
Expand All @@ -1793,6 +1834,11 @@ def test__reverse_transform_no_nans(self):
# Assert
expected = pd.Series(['a', 'b', 'c'])
pd.testing.assert_series_equal(out, expected)
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
np.testing.assert_array_equal(mock_input_data, transformed)
assert mock_input_dtype == 'O'
mock_convert_dtype.assert_called_once()

def test__reverse_transform_nans(self):
# Setup
Expand Down Expand Up @@ -2168,7 +2214,9 @@ def test__reverse_transform_clips_values(self):
# Assert
pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c']))

def test__reverse_transform_add_noise(self):
@patch('rdt.transformers.categorical.check_nan_in_transform')
@patch('rdt.transformers.categorical.try_convert_to_dtype')
def test__reverse_transform_add_noise(self, mock_convert_dtype, mock_check_nan):
"""Test the ``_reverse_transform`` method with ``add_noise``.

Test that the method correctly reverse transforms the data
Expand All @@ -2183,12 +2231,36 @@ def test__reverse_transform_add_noise(self):
transformer = LabelEncoder(add_noise=True)
transformer.values_to_categories = {0: 'a', 1: 'b', 2: 'c'}
data = pd.Series([0.5, 1.0, 10.9])
mock_convert_dtype.return_value = pd.Series(['a', 'b', 'c'])

# Run
out = transformer._reverse_transform(data)

# Assert
pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c']))
mock_input_data = mock_check_nan.call_args.args[0]
mock_input_dtype = mock_check_nan.call_args.args[1]
pd.testing.assert_series_equal(mock_input_data, data)
assert mock_input_dtype == 'O'
mock_convert_dtype.assert_called_once()

def test__reverse_transform_integer_and_nans(self):
"""Test the ``_reverse_transform`` method with integers and nans.

Test that the method correctly reverse transforms the data
when the initial data is integers and the transformed data has nans.
The nan in the input is expected to come back as nan, with the
remaining values mapped back to their integer categories.
"""
# Setup
# The fitted state is assigned by hand here instead of calling ``fit``.
transformer = LabelEncoder()
transformer.values_to_categories = {0: 11, 1: 12, 2: 13}
transformer.dtype = 'int'
data = pd.Series([0, 1, np.nan])

# Run
out = transformer._reverse_transform(data)

# Assert
# The expected Series holds a nan, so pandas builds it as float, not int.
pd.testing.assert_series_equal(out, pd.Series([11, 12, np.nan]))


class TestOrderedLabelEncoder:
Expand Down Expand Up @@ -2272,6 +2344,7 @@ def test__fit(self):
transformer._fit(data)

# Assert
assert transformer.dtype == 'float'
expected_values_to_categories = {0: 2, 1: 3, 2: np.nan, 3: 1}
expected_categories_to_values = {2: 0, 3: 1, 1: 3, np.nan: 2}
for key, value in transformer.values_to_categories.items():
Expand Down
Loading
Loading