Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RegexGenerator gives a confusing message: # of possibilities are shown as an imaginary number #754

Merged
merged 2 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions rdt/transformers/text.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Transformers for text data."""
import warnings
import logging

import numpy as np
import pandas as pd
Expand All @@ -8,6 +8,8 @@
from rdt.transformers.base import BaseTransformer
from rdt.transformers.utils import strings_from_regex

LOGGER = logging.getLogger(__name__)


class IDGenerator(BaseTransformer):
"""Generate an ID column.
Expand Down Expand Up @@ -161,10 +163,10 @@ def _reverse_transform(self, data):
f"Please use a different regex for column ('{self.get_input_column()}')."
)

warnings.warn(
f"The data has {sample_size} rows but the regex for '{self.get_input_column()}' "
f'can only create {self.generator_size} unique values. Some values in '
f"'{self.get_input_column()}' may be repeated."
LOGGER.info(
"The data has %s rows but the regex for '%s' can only create %s unique values."
" Some values in '%s' may be repeated.",
sample_size, self.get_input_column(), self.generator_size, self.get_input_column()
)
Comment on lines +166 to 170
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could do:

message = (
    f"..."
    f"..."
)

But it is actually fine to leave it as it is right now. The reason is that an f-string is always evaluated eagerly, whereas with this style of formatting the message is only built when the record is actually emitted, which means it is only generated when the logger is enabled.


remaining = self.generator_size - self.generated
Expand Down
2 changes: 1 addition & 1 deletion rdt/transformers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def strings_from_regex(regex, max_repeat=16):
generators.append((generator, option, args))
sizes.append(size)

return _from_generators(generators, max_repeat), np.prod(sizes, dtype=np.complex128)
return _from_generators(generators, max_repeat), np.prod(sizes, dtype=np.complex128).real


def fill_nan_with_none(data):
Expand Down
28 changes: 28 additions & 0 deletions tests/unit/transformers/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,3 +485,31 @@ def test__reverse_transform_enforce_uniqueness_not_enough_remaining(self):
)
with pytest.raises(TransformerProcessingError, match=error_msg):
instance._reverse_transform(columns_data)

@patch('rdt.transformers.text.LOGGER')
def test__reverse_transform_info_message(self, mock_logger):
    """Test that ``_reverse_transform`` logs an info message.

    When ``enforce_uniqueness`` is ``False`` and ``instance.data_length`` exceeds
    ``instance.generator_size``, the method is expected to call ``LOGGER.info``
    exactly once, passing the message template and its arguments separately.
    """
    # Setup
    generator = RegexGenerator('[A-Z]', enforce_uniqueness=False)
    generator.data_length = 6
    generator.generator_size = 5
    generator.generated = 0
    generator.columns = ['a']
    empty_column = pd.Series()

    # Run
    generator._reverse_transform(empty_column)

    # Assert
    message_template = (
        "The data has %s rows but the regex for '%s' can only create %s unique values."
        " Some values in '%s' may be repeated."
    )
    mock_logger.info.assert_called_once_with(message_template, 6, 'a', 5, 'a')
Loading