diff --git a/rdt/transformers/text.py b/rdt/transformers/text.py index 7cae7a29..ca1887a1 100644 --- a/rdt/transformers/text.py +++ b/rdt/transformers/text.py @@ -1,5 +1,5 @@ """Transformers for text data.""" -import warnings +import logging import numpy as np import pandas as pd @@ -8,6 +8,8 @@ from rdt.transformers.base import BaseTransformer from rdt.transformers.utils import strings_from_regex +LOGGER = logging.getLogger(__name__) + class IDGenerator(BaseTransformer): """Generate an ID column. @@ -161,10 +163,10 @@ def _reverse_transform(self, data): f"Please use a different regex for column ('{self.get_input_column()}')." ) - warnings.warn( - f"The data has {sample_size} rows but the regex for '{self.get_input_column()}' " - f'can only create {self.generator_size} unique values. Some values in ' - f"'{self.get_input_column()}' may be repeated." + LOGGER.info( + "The data has %s rows but the regex for '%s' can only create %s unique values." + " Some values in '%s' may be repeated.", + sample_size, self.get_input_column(), self.generator_size, self.get_input_column() ) remaining = self.generator_size - self.generated diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py index fe264764..240b829a 100644 --- a/rdt/transformers/utils.py +++ b/rdt/transformers/utils.py @@ -149,7 +149,7 @@ def strings_from_regex(regex, max_repeat=16): generators.append((generator, option, args)) sizes.append(size) - return _from_generators(generators, max_repeat), np.prod(sizes, dtype=np.complex128) + return _from_generators(generators, max_repeat), np.prod(sizes, dtype=np.complex128).real def fill_nan_with_none(data): diff --git a/tests/unit/transformers/test_text.py b/tests/unit/transformers/test_text.py index 511c351c..ff208b2d 100644 --- a/tests/unit/transformers/test_text.py +++ b/tests/unit/transformers/test_text.py @@ -485,3 +485,31 @@ def test__reverse_transform_enforce_uniqueness_not_enough_remaining(self): ) with pytest.raises(TransformerProcessingError, match=error_msg): instance._reverse_transform(columns_data) + + @patch('rdt.transformers.text.LOGGER') + def test__reverse_transform_info_message(self, mock_logger): + """Test the ``_reverse_transform`` method. + + Validate that the ``_reverse_transform`` method logs an info message when + ``enforce_uniqueness`` is ``False`` and the ``instance.data_length`` is bigger than + ``instance.generator_size``. + """ + # Setup + instance = RegexGenerator('[A-Z]', enforce_uniqueness=False) + instance.data_length = 6 + instance.generator_size = 5 + instance.generated = 0 + instance.columns = ['a'] + columns_data = pd.Series() + + # Run + instance._reverse_transform(columns_data) + + # Assert + expected_format = ( + "The data has %s rows but the regex for '%s' can only create %s unique values. Some " + "values in '%s' may be repeated." + ) + expected_args = (6, 'a', 5, 'a') + + mock_logger.info.assert_called_once_with(expected_format, *expected_args)