Skip to content

Commit

Permalink
Add feature
Browse files Browse the repository at this point in the history
  • Loading branch information
fealho committed Feb 22, 2024
1 parent 7b7b8a1 commit d7022b2
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 23 deletions.
50 changes: 40 additions & 10 deletions rdt/transformers/pii/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,15 @@ class AnonymizedFaker(BaseTransformer):
Keyword args to pass into the ``function_name`` when being called.
locales (list):
List of localized providers to use instead of the global provider.
cardinality_rule (str):
If ``'unique'``, enforce that every created value is unique.
If ``'match'``, match the cardinality of the data seen during fit.
If ``None``, do not consider cardinality.
Defaults to ``None``.
enforce_uniqueness (bool):
Whether or not to ensure that the new anonymized data is all unique. If it isn't
possible to create the requested number of rows, then an error will be raised.
**DEPRECATED** Whether or not to ensure that the new anonymized data is all unique.
If it isn't possible to create the requested number of rows, then an error will be
raised.
Defaults to ``False``.
missing_value_generation (str or None):
The way missing values are being handled. There are two strategies:
Expand Down Expand Up @@ -98,10 +104,18 @@ def _check_locales(self):
)

def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
locales=None, enforce_uniqueness=False, missing_value_generation='random'):
locales=None, cardinality_rule=None, enforce_uniqueness=False,
missing_value_generation='random'):
super().__init__()
self._data_cardinality = None
self.data_length = None
self.enforce_uniqueness = enforce_uniqueness
self.cardinality_rule = cardinality_rule
if enforce_uniqueness:
warnings.warn(
"The 'enforce_uniqueness' parameter is no longer supported. "
"Please use the 'cardinality_rule' parameter instead.",
FutureWarning
)
self.provider_name = provider_name if provider_name else 'BaseProvider'
if self.provider_name != 'BaseProvider' and function_name is None:
raise TransformerInputError(
Expand Down Expand Up @@ -156,7 +170,11 @@ def reset_randomization(self):

def _function(self):
"""Return the result of calling the ``faker`` function."""
faker_attr = self.faker.unique if self.enforce_uniqueness else self.faker
if self.cardinality_rule in {'unique', 'match'}:
faker_attr = self.faker.unique
elif self.cardinality_rule is None:
faker_attr = self.faker

result = getattr(faker_attr, self.function_name)(**self.function_kwargs)

if isinstance(result, Iterable) and not isinstance(result, str):
Expand Down Expand Up @@ -185,6 +203,9 @@ def _fit(self, data):
if self.missing_value_generation == 'random':
self._nan_frequency = data.isna().sum() / len(data)

if self.cardinality_rule == 'match':
self._data_cardinality = len(data.unique())

def _transform(self, _data):
"""Drop the input column by returning ``None``."""
return None
Expand All @@ -205,10 +226,18 @@ def _reverse_transform(self, data):
sample_size = self.data_length

try:
reverse_transformed = np.array([
self._function()
for _ in range(sample_size)
], dtype=object)
if self.cardinality_rule == 'match':
unique_categories = np.array([
self._function()
for _ in range(self._data_cardinality)
], dtype=object)
reverse_transformed = np.random.choice(unique_categories, sample_size)

else:
reverse_transformed = np.array([
self._function()
for _ in range(sample_size)
], dtype=object)
except faker.exceptions.UniquenessException as exception:
raise TransformerProcessingError(
f'The Faker function you specified is not able to generate {sample_size} unique '
Expand All @@ -235,6 +264,7 @@ def __repr__(self):
args = inspect.getfullargspec(self.__init__)
keys = args.args[1:]
defaults = dict(zip(keys, args.defaults))
keys.remove('enforce_uniqueness')
instanced = {key: getattr(self, key) for key in keys}

defaults['function_name'] = None
Expand Down Expand Up @@ -283,7 +313,7 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
function_name=function_name,
function_kwargs=function_kwargs,
locales=locales,
enforce_uniqueness=True
cardinality_rule='unique'
)
self._mapping_dict = {}
self._reverse_mapping_dict = {}
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/transformers/pii/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_custom_provider_with_nans(self):
assert len(reverse_transform['cc']) == 5
assert reverse_transform['cc'].isna().sum() == 1

def test_enforce_uniqueness(self):
def test_cardinality_rule(self):
"""Test that ``AnonymizedFaker`` works with uniqueness.
Also ensure that when we call ``reset_randomization`` the generator will be able to
Expand All @@ -139,7 +139,7 @@ def test_enforce_uniqueness(self):
'job': np.arange(500)
})

instance = AnonymizedFaker('job', 'job', enforce_uniqueness=True)
instance = AnonymizedFaker('job', 'job', cardinality_rule='unique')
transformed = instance.fit_transform(data, 'job')
reverse_transform = instance.reverse_transform(transformed)

Expand Down
61 changes: 50 additions & 11 deletions tests/unit/transformers/pii/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,14 @@ def test_check_provider_function_raise_attribute_error(self):
with pytest.raises(TransformerProcessingError, match=expected_message):
AnonymizedFaker.check_provider_function('TestProvider', 'TestFunction')

def test__function_enforce_uniqueness_false(self):
def test__function_cardinality_rule_none(self):
"""Test that ``_function`` does not use ``faker.unique``.
The method ``_function`` should return a call from the
``instance.faker.provider.<function>``.
Mock:
- Instance of 'AnonymizedFaker'.
- Instance ``enforce_uniqueness`` set to `False`
- Faker instance.
- A function for the faker instance.
Expand All @@ -99,11 +98,11 @@ def test__function_enforce_uniqueness_false(self):
"""
# setup
instance = Mock()
instance.enforce_uniqueness = False
function = Mock()
unique_function = Mock()
function.return_value = 1

instance.cardinality_rule = None
instance.faker.unique.number = unique_function
instance.faker.number = function
instance.function_name = 'number'
Expand All @@ -117,15 +116,14 @@ def test__function_enforce_uniqueness_false(self):
function.assert_called_once_with(type='int')
assert result == 1

def test__function_enforce_uniqueness_true(self):
def test__function_cardinality_rule_unique(self):
"""Test that ``_function`` uses the ``faker.unique``.
The method ``_function`` should return a call from the
``instance.faker.unique.<function>``.
Mock:
- Instance of 'AnonymizedFaker'.
- Instance ``enforce_uniqueness`` set to ``True``
- Faker instance.
- A function for the faker instance.
Expand All @@ -138,11 +136,33 @@ def test__function_enforce_uniqueness_true(self):
"""
# setup
instance = Mock()
instance.enforce_uniqueness = True
function = Mock()
unique_function = Mock()
unique_function.return_value = 1

instance.cardinality_rule = 'unique'
instance.faker.unique.number = unique_function
instance.faker.number = function
instance.function_name = 'number'
instance.function_kwargs = {'type': 'int'}

# Run
result = AnonymizedFaker._function(instance)

# Assert
function.assert_not_called()
unique_function.assert_called_once_with(type='int')
assert result == 1

def test__function_cardinality_rule_match(self):
"""Test it when 'cardinality_rule' is 'match'."""
# setup
instance = Mock()
function = Mock()
unique_function = Mock()
unique_function.return_value = 1

instance.cardinality_rule = 'match'
instance.faker.unique.number = unique_function
instance.faker.number = function
instance.function_name = 'number'
Expand All @@ -160,7 +180,7 @@ def test__function_with_iterables_return(self):
"""Test that ``_function`` returns the values of the iterable."""
# setup
instance = Mock()
instance.enforce_uniqueness = False
instance.cardinality_rule = None
function = Mock()
function.return_value = ('value_1', 'value_2')

Expand Down Expand Up @@ -291,7 +311,7 @@ def test___init__default(self, mock_check_provider_function, mock_faker):
assert instance.function_kwargs == {}
assert instance.locales is None
mock_faker.Faker.assert_called_once_with(None)
assert instance.enforce_uniqueness is False
assert instance.cardinality_rule is None
assert instance.missing_value_generation == 'random'

def test___init__error_missing_value_generation(self):
Expand Down Expand Up @@ -342,7 +362,7 @@ def test___init__custom(self, mock_check_provider_function, mock_faker):
'type': 'visa'
},
locales=['en_US', 'fr_FR'],
enforce_uniqueness=True
cardinality_rule='match'
)

# Assert
Expand All @@ -352,7 +372,7 @@ def test___init__custom(self, mock_check_provider_function, mock_faker):
assert instance.function_kwargs == {'type': 'visa'}
assert instance.locales == ['en_US', 'fr_FR']
mock_faker.Faker.assert_called_once_with(['en_US', 'fr_FR'])
assert instance.enforce_uniqueness
assert instance.cardinality_rule == 'match'

def test___init__no_function_name(self):
"""Test the instantiation of the transformer with custom parameters.
Expand Down Expand Up @@ -503,6 +523,25 @@ def test__reverse_transform(self):
assert function.call_args_list == [call(), call(), call()]
np.testing.assert_array_equal(result, np.array(['a', 'b', 'c']))

def test__reverse_transform_cardinality_rule_match(self):
    """Test ``_reverse_transform`` when ``cardinality_rule`` is ``'match'``.

    Setup:
        - An ``AnonymizedFaker`` whose ``data_length`` is 3 and whose
          ``_data_cardinality`` is 2.
        - ``_function`` replaced by a mock that yields ``'a'``, ``'b'``, ``'c'``.

    Assert:
        - The mocked ``_function`` is called exactly twice, with no arguments.
        - Every value in the result is one of the two generated categories.
    """
    # Setup
    fake_generator = Mock(side_effect=['a', 'b', 'c'])
    transformer = AnonymizedFaker()
    transformer.cardinality_rule = 'match'
    transformer._data_cardinality = 2
    transformer.data_length = 3
    transformer._function = fake_generator

    # Run
    sampled = transformer._reverse_transform(None)

    # Assert
    expected_calls = [call(), call()]
    assert fake_generator.call_args_list == expected_calls
    assert set(sampled) <= {'a', 'b'}

def test__reverse_transform_with_nans(self):
"""Test that ``_reverse_transform`` generates NaNs."""
# Setup
Expand Down Expand Up @@ -539,7 +578,7 @@ def test__reverse_transform_not_enough_unique_values(self):
- Raises an error.
"""
# Setup
instance = AnonymizedFaker('misc', 'boolean', enforce_uniqueness=True)
instance = AnonymizedFaker('misc', 'boolean', cardinality_rule='unique')
data = pd.Series(['a', 'b', 'c', 'd'])
instance.columns = ['a']

Expand Down

0 comments on commit d7022b2

Please sign in to comment.