diff --git a/rdt/transformers/categorical.py b/rdt/transformers/categorical.py index fef12424..13d4e038 100644 --- a/rdt/transformers/categorical.py +++ b/rdt/transformers/categorical.py @@ -9,7 +9,7 @@ from rdt.errors import TransformerInputError from rdt.transformers.base import BaseTransformer -from rdt.transformers.utils import fill_nan_with_none +from rdt.transformers.utils import check_nan_in_transform, fill_nan_with_none, try_convert_to_dtype LOGGER = logging.getLogger(__name__) @@ -177,6 +177,7 @@ def _reverse_transform(self, data): Returns: pandas.Series """ + check_nan_in_transform(data, self.dtype) data = data.clip(0, 1) bins = [0] labels = [] @@ -192,7 +193,10 @@ def _reverse_transform(self, data): labels.append(key) result = pd.cut(data, bins=bins, labels=labels, include_lowest=True) - return result.replace(nan_name, np.nan).astype(self.dtype) + result = result.replace(nan_name, np.nan) + result = try_convert_to_dtype(result, self.dtype) + + return result class OrderedUniformEncoder(UniformEncoder): @@ -333,6 +337,7 @@ def __init__(self, add_noise=False): ) super().__init__() self.add_noise = add_noise + self._is_integer = None @staticmethod def _get_intervals(data): @@ -516,6 +521,7 @@ def _reverse_transform(self, data): Returns: pandas.Series """ + check_nan_in_transform(data, self.dtype) data = data.clip(0, 1) num_rows = len(data) num_categories = len(self.means) @@ -545,6 +551,7 @@ class OneHotEncoder(BaseTransformer): _dummy_encoded = False _indexer = None _uniques = None + dtype = None @staticmethod def _prepare_data(data): @@ -582,6 +589,7 @@ def _fit(self, data): data (pandas.Series or pandas.DataFrame): Data to fit the transformer to. 
""" + self.dtype = data.dtype data = self._prepare_data(data) null = pd.isna(data).to_numpy() @@ -657,6 +665,7 @@ def _reverse_transform(self, data): Returns: pandas.Series """ + check_nan_in_transform(data, self.dtype) if not isinstance(data, np.ndarray): data = data.to_numpy() @@ -664,8 +673,10 @@ def _reverse_transform(self, data): data = data.reshape(-1, 1) indices = np.argmax(data, axis=1) + result = pd.Series(indices).map(self.dummies.__getitem__) + result = try_convert_to_dtype(result, self.dtype) - return pd.Series(indices).map(self.dummies.__getitem__) + return result class LabelEncoder(BaseTransformer): @@ -801,13 +812,15 @@ def _reverse_transform(self, data): Returns: pandas.Series """ + check_nan_in_transform(data, self.dtype) if self.add_noise: data = np.floor(data) data = data.clip(min(self.values_to_categories), max(self.values_to_categories)) data = data.round().map(self.values_to_categories) + data = try_convert_to_dtype(data, self.dtype) - return data.astype(self.dtype) + return data class OrderedLabelEncoder(LabelEncoder): @@ -865,6 +878,7 @@ def _fit(self, data): data (pandas.Series): Data to fit the transformer to. """ + self.dtype = data.dtype data = data.fillna(np.nan) missing = list(data[~data.isin(self.order)].unique()) if len(missing) > 0: diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index 240b829a..4e78685a 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -2,8 +2,10 @@ import re import string +import warnings import numpy as np +import pandas as pd import sre_parse # isort:skip @@ -184,3 +186,51 @@ def flatten_column_list(column_list): flattened.append(column) return flattened + + +def check_nan_in_transform(data, dtype): + """Check if there are null values in the transformed data. + + Args: + data (pd.Series or numpy.ndarray): + Data that has been transformed. + dtype (str): + Data type of the transformed data. 
+ """ + if pd.isna(data).any().any(): + message = ( + 'There are null values in the transformed data. The reversed ' + 'transformed data will contain null values' + ) + is_integer = pd.api.types.is_integer_dtype(dtype) + if is_integer: + message += " of type 'float'." + else: + message += '.' + + warnings.warn(message) + + +def try_convert_to_dtype(data, dtype): + """Try to convert data to a given dtype. + + Args: + data (pd.Series or numpy.ndarray): + Data to convert. + dtype (str): + Data type to convert to. + + Returns: + data: + Data converted to the given dtype. + """ + try: + data = data.astype(dtype) + except ValueError as error: + is_integer = pd.api.types.is_integer_dtype(dtype) + if is_integer: + data = data.astype(float) + else: + raise error + + return data diff --git a/tests/unit/transformers/test_categorical.py b/tests/unit/transformers/test_categorical.py index afd9d16a..dbad53a2 100644 --- a/tests/unit/transformers/test_categorical.py +++ b/tests/unit/transformers/test_categorical.py @@ -271,7 +271,9 @@ def test__transform_user_warning(self): assert transformed.iloc[4] >= 0 assert transformed.iloc[4] < 1 - def test__reverse_transform(self): + @patch('rdt.transformers.categorical.check_nan_in_transform') + @patch('rdt.transformers.categorical.try_convert_to_dtype') + def test__reverse_transform(self, mock_convert_dtype, mock_check_nan): """Test the ``_reverse_transform``.""" # Setup data = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2]) @@ -289,12 +291,18 @@ def test__reverse_transform(self): } transformed = pd.Series([0.12, 0.254, 0.789, 0.43, 0.56, 0.08, 0.67, 0.98, 0.36]) + mock_convert_dtype.return_value = pd.Series([1, 2, 3, 2, 2, 1, 3, 3, 2]) # Run output = transformer._reverse_transform(transformed) # Asserts pd.testing.assert_series_equal(output, data) + mock_input_data = mock_check_nan.call_args.args[0] + mock_input_dtype = mock_check_nan.call_args.args[1] + pd.testing.assert_series_equal(mock_input_data, transformed) + assert mock_input_dtype == 
transformer.dtype + mock_convert_dtype.assert_called_once() def test__reverse_transform_nans(self): """Test ``_reverse_transform`` for data with NaNs.""" @@ -323,6 +331,25 @@ def test__reverse_transform_nans(self): # Asserts pd.testing.assert_series_equal(output, data) + def test__reverse_transform_integer_and_nans(self): + """Test the ``reverse_transform`` method with integers and nans. + + Test that the method correctly reverse transforms the data + when the initial data is integers and the transformed data has nans. + """ + # Setup + transformer = UniformEncoder() + transformer.frequencies = {11: 0.2, 12: 0.3, 13: 0.5} + transformer.intervals = {11: [0, 0.2], 12: [0.2, 0.5], 13: [0.5, 1]} + transformer.dtype = np.int64 + data = pd.Series([0.1, 0.25, np.nan, 0.65]) + + # Run + out = transformer._reverse_transform(data) + + # Assert + pd.testing.assert_series_equal(out, pd.Series([11, 12, np.nan, 13])) + @pytest.fixture(autouse=True) def _setup_caplog(caplog): @@ -718,7 +745,8 @@ def test__get_value_add_noise_true(self, norm_mock): # Asserts assert result == 0.2745 - def test__reverse_transform_series(self): + @patch('rdt.transformers.categorical.check_nan_in_transform') + def test__reverse_transform_series(self, mock_check_nan): """Test reverse_transform a pandas Series""" # Setup data = pd.Series(['foo', 'bar', 'bar', 'foo', 'foo', 'tar']) @@ -730,6 +758,10 @@ def test__reverse_transform_series(self): result = transformer._reverse_transform(rt_data) # Asserts + mock_input_data = mock_check_nan.call_args.args[0] + mock_input_dtype = mock_check_nan.call_args.args[1] + pd.testing.assert_series_equal(mock_input_data, rt_data) + assert mock_input_dtype == transformer.dtype expected_intervals = { 'foo': ( 0, @@ -1123,7 +1155,8 @@ def test__reverse_transform_by_row_called(self): np.testing.assert_array_equal(reverse_arg, data.clip(0, 1)) assert reverse == categorical_transformer_mock._reverse_transform_by_row.return_value - def test__reverse_transform_by_row(self): + 
@patch('rdt.transformers.categorical.check_nan_in_transform') + def test__reverse_transform_by_row(self, mock_check_nan): """Test the _reverse_transform_by_row method with numerical data. Expect that the transformed data is correctly reverse transformed. @@ -1156,6 +1189,10 @@ def test__reverse_transform_by_row(self): reverse = transformer._reverse_transform(transformed) # Assert + mock_input_data = mock_check_nan.call_args.args[0] + mock_input_dtype = mock_check_nan.call_args.args[1] + pd.testing.assert_series_equal(mock_input_data, transformed) + assert mock_input_dtype == data.dtype pd.testing.assert_series_equal(data, reverse) @@ -1222,6 +1259,7 @@ def test__fit_dummies_no_nans(self): # Assert np.testing.assert_array_equal(ohe.dummies, ['a', 2, 'c']) + assert ohe.dtype == 'object' def test__fit_dummies_nans(self): """Test the ``_fit`` method without nans. @@ -1776,11 +1814,14 @@ def test__transform_numeric(self): assert not ohe._dummy_encoded np.testing.assert_array_equal(out, expected) - def test__reverse_transform_no_nans(self): + @patch('rdt.transformers.categorical.check_nan_in_transform') + @patch('rdt.transformers.categorical.try_convert_to_dtype') + def test__reverse_transform_no_nans(self, mock_convert_dtype, mock_check_nan): # Setup ohe = OneHotEncoder() data = pd.Series(['a', 'b', 'c']) ohe._fit(data) + mock_convert_dtype.return_value = data # Run transformed = np.array([ @@ -1793,6 +1834,11 @@ def test__reverse_transform_no_nans(self): # Assert expected = pd.Series(['a', 'b', 'c']) pd.testing.assert_series_equal(out, expected) + mock_input_data = mock_check_nan.call_args.args[0] + mock_input_dtype = mock_check_nan.call_args.args[1] + np.testing.assert_array_equal(mock_input_data, transformed) + assert mock_input_dtype == 'O' + mock_convert_dtype.assert_called_once() def test__reverse_transform_nans(self): # Setup @@ -2168,7 +2214,9 @@ def test__reverse_transform_clips_values(self): # Assert pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 
'c'])) - def test__reverse_transform_add_noise(self): + @patch('rdt.transformers.categorical.check_nan_in_transform') + @patch('rdt.transformers.categorical.try_convert_to_dtype') + def test__reverse_transform_add_noise(self, mock_convert_dtype, mock_check_nan): """Test the ``_reverse_transform`` method with ``add_noise``. Test that the method correctly reverse transforms the data @@ -2183,12 +2231,36 @@ def test__reverse_transform_add_noise(self): transformer = LabelEncoder(add_noise=True) transformer.values_to_categories = {0: 'a', 1: 'b', 2: 'c'} data = pd.Series([0.5, 1.0, 10.9]) + mock_convert_dtype.return_value = pd.Series(['a', 'b', 'c']) # Run out = transformer._reverse_transform(data) # Assert pd.testing.assert_series_equal(out, pd.Series(['a', 'b', 'c'])) + mock_input_data = mock_check_nan.call_args.args[0] + mock_input_dtype = mock_check_nan.call_args.args[1] + pd.testing.assert_series_equal(mock_input_data, data) + assert mock_input_dtype == 'O' + mock_convert_dtype.assert_called_once() + + def test__reverse_transform_integer_and_nans(self): + """Test the ``reverse_transform`` method with integers and nans. + + Test that the method correctly reverse transforms the data + when the initial data is integers and the transformed data has nans. 
+ """ + # Setup + transformer = LabelEncoder() + transformer.values_to_categories = {0: 11, 1: 12, 2: 13} + transformer.dtype = 'int' + data = pd.Series([0, 1, np.nan]) + + # Run + out = transformer._reverse_transform(data) + + # Assert + pd.testing.assert_series_equal(out, pd.Series([11, 12, np.nan])) class TestOrderedLabelEncoder: @@ -2272,6 +2344,7 @@ def test__fit(self): transformer._fit(data) # Assert + assert transformer.dtype == 'float' expected_values_to_categories = {0: 2, 1: 3, 2: np.nan, 3: 1} expected_categories_to_values = {2: 0, 3: 1, 1: 3, np.nan: 2} for key, value in transformer.values_to_categories.items(): diff --git a/tests/unit/transformers/test_utils.py b/tests/unit/transformers/test_utils.py index 475a32da..d9020232 100644 --- a/tests/unit/transformers/test_utils.py +++ b/tests/unit/transformers/test_utils.py @@ -1,7 +1,13 @@ import sre_parse from sre_constants import MAXREPEAT -from rdt.transformers.utils import _any, _max_repeat, flatten_column_list, strings_from_regex +import numpy as np +import pandas as pd +import pytest + +from rdt.transformers.utils import ( + _any, _max_repeat, check_nan_in_transform, flatten_column_list, strings_from_regex, + try_convert_to_dtype) def test_strings_from_regex_literal(): @@ -66,3 +72,58 @@ def test_flatten_column_list(): # Assert expected_flattened_list = ['column1', 'column2', 'column3', 'column4', 'column5', 'column6'] assert flattened_list == expected_flattened_list + + +def test_check_nan_in_transform(): + """Test ``check_nan_in_transform`` method. + + If there are NaNs in the data, a warning should be raised. + If the data was integer, the warning should mention the 'float' type. + """ + # Setup + transformed = pd.Series([0.1026, 0.1651, np.nan, 0.3116, 0.6546, 0.8541, 0.7041]) + data_without_nans = pd.DataFrame({ + 'col 1': [1, 2, 3], + 'col 2': [4, 5, 6], + }) + + # Run and Assert + check_nan_in_transform(data_without_nans, 'float') + expected_message = ( + 'There are null values in the transformed data. 
The reversed ' + 'transformed data will contain null values' + ) + expected_message_object = expected_message + '.' + expected_message_integer = expected_message + " of type 'float'." + with pytest.warns(UserWarning, match=expected_message_object): + check_nan_in_transform(transformed, 'object') + + with pytest.warns(UserWarning, match=expected_message_integer): + check_nan_in_transform(transformed, 'int') + + +def test_try_convert_to_dtype(): + """Test ``try_convert_to_dtype`` method. + + If the data can be converted to the specified dtype, it should be converted. + If the data cannot be converted, a ValueError should be raised. + Should fall back to float when integer data contains NaNs. + """ + # Setup + data_int_with_nan = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0]) + data_not_convertible = pd.Series(['a', 'b', 'c', 'd', 'e']) + + # Run + output_convertibe = try_convert_to_dtype(data_int_with_nan, 'str') + output_int_with_nan = try_convert_to_dtype(data_int_with_nan, 'int') + with pytest.raises(ValueError, match="could not convert string to float: 'a'"): + try_convert_to_dtype(data_not_convertible, 'int') + + with pytest.raises(ValueError, match="could not convert string to float: 'a'"): + try_convert_to_dtype(data_not_convertible, 'float') + + # Assert + expected_data_with_nan = pd.Series([1, 2, np.nan, 4, 5]) + expected_data_convertibe = pd.Series(['1.0', '2.0', 'nan', '4.0', '5.0']) + pd.testing.assert_series_equal(output_int_with_nan, expected_data_with_nan) + pd.testing.assert_series_equal(output_convertibe, expected_data_convertibe)