Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cardinality_rule parameter to AnonymizedFaker #762

Merged
merged 13 commits into from
Feb 27, 2024
83 changes: 72 additions & 11 deletions rdt/transformers/pii/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import faker
import numpy as np
import pandas as pd

from rdt.errors import TransformerInputError, TransformerProcessingError
from rdt.transformers.base import BaseTransformer
Expand All @@ -33,9 +34,15 @@ class AnonymizedFaker(BaseTransformer):
Keyword args to pass into the ``function_name`` when being called.
locales (list):
List of localized providers to use instead of the global provider.
cardinality_rule (str):
If ``'unique'`` enforce that every created value is unique.
If ``'match'`` match the cardinality of the data seen during fit.
If ``None`` do not consider cardinality.
Defaults to ``None``.
enforce_uniqueness (bool):
Whether or not to ensure that the new anonymized data is all unique. If it isn't
possible to create the requested number of rows, then an error will be raised.
**DEPRECATED** Whether or not to ensure that the new anonymized data is all unique.
If it isn't possible to create the requested number of rows, then an error will be
raised.
Defaults to ``False``.
missing_value_generation (str or None):
The way missing values are being handled. There are two strategies:
Expand All @@ -47,6 +54,8 @@ class AnonymizedFaker(BaseTransformer):

"""

# pylint: disable=too-many-instance-attributes

IS_GENERATOR = True
INPUT_SDTYPE = 'pii'

Expand Down Expand Up @@ -98,10 +107,21 @@ def _check_locales(self):
)

def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
locales=None, enforce_uniqueness=False, missing_value_generation='random'):
locales=None, cardinality_rule=None, enforce_uniqueness=False,
missing_value_generation='random'):
super().__init__()
self._data_cardinality = None
self.data_length = None
self.enforce_uniqueness = enforce_uniqueness
self.cardinality_rule = cardinality_rule.lower() if cardinality_rule else None
if enforce_uniqueness:
warnings.warn(
"The 'enforce_uniqueness' parameter is no longer supported. "
"Please use the 'cardinality_rule' parameter instead.",
FutureWarning
)
Comment on lines +116 to +121
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After the warning, if `self.cardinality_rule` is `None`, we could set `self.cardinality_rule` to `'unique'` to match the intent of `enforce_uniqueness=True`. If `self.cardinality_rule` isn't `None`, then the user definitely set both parameters, and we should probably ignore `enforce_uniqueness`.

if not self.cardinality_rule:
self.cardinality_rule = 'unique'

self.provider_name = provider_name if provider_name else 'BaseProvider'
if self.provider_name != 'BaseProvider' and function_name is None:
raise TransformerInputError(
Expand Down Expand Up @@ -156,7 +176,11 @@ def reset_randomization(self):

def _function(self):
"""Return the result of calling the ``faker`` function."""
faker_attr = self.faker.unique if self.enforce_uniqueness else self.faker
if self.cardinality_rule in {'unique', 'match'}:
faker_attr = self.faker.unique
else:
faker_attr = self.faker

result = getattr(faker_attr, self.function_name)(**self.function_kwargs)

if isinstance(result, Iterable) and not isinstance(result, str):
Expand Down Expand Up @@ -185,10 +209,42 @@ def _fit(self, data):
if self.missing_value_generation == 'random':
self._nan_frequency = data.isna().sum() / len(data)

if self.cardinality_rule == 'match':
# remove nans from data
self._data_cardinality = len(data.dropna().unique())

def _transform(self, _data):
    """Produce no transformed output for the input column.

    Args:
        _data:
            The column being transformed. It is not used.

    Returns:
        None
    """
    return None

def _get_unique_categories(self, samples):
    """Generate ``samples`` values by calling ``self._function`` once per value.

    Args:
        samples (int):
            Number of values to generate.

    Returns:
        numpy.ndarray:
            Object-dtype array with one ``self._function()`` result per requested
            sample, in generation order.
    """
    generated = []
    for _ in range(samples):
        generated.append(self._function())

    return np.array(generated, dtype=object)

def _reverse_transform_cardinality_rule_match(self, sample_size):
    """Build ``sample_size`` values whose cardinality follows the fitted data.

    When ``missing_value_generation`` is ``'random'``, a share of the output equal to
    ``int(self._nan_frequency * sample_size)`` is filled with ``np.nan``. The remaining
    slots receive at most ``self._data_cardinality`` freshly generated categories; any
    slots still left over are filled by resampling those categories.

    Args:
        sample_size (int):
            Number of values to produce.

    Returns:
        numpy.ndarray:
            Object-dtype array of length ``sample_size``. It is shuffled in place
            unless the missing values alone already cover the request.
    """
    # Start from an empty object array so the concatenated result keeps dtype=object.
    pieces = [np.array([], dtype=object)]
    num_nans = 0
    if self.missing_value_generation == 'random':
        num_nans = int(self._nan_frequency * sample_size)
        pieces.append(np.full(num_nans, np.nan))

    if sample_size <= num_nans:
        # No room for actual categories: return the missing values unshuffled.
        return np.concatenate(pieces)

    num_values = sample_size - num_nans
    if num_values < self._data_cardinality:
        # Fewer slots than fitted categories: every generated value appears once.
        pieces.append(self._get_unique_categories(num_values))
    else:
        categories = self._get_unique_categories(self._data_cardinality)
        num_copies = num_values - self._data_cardinality
        pieces.append(categories)
        pieces.append(np.random.choice(categories, num_copies))

    result = np.concatenate(pieces)
    np.random.shuffle(result)

    return result

def _reverse_transform(self, data):
"""Generate new anonymized data using a ``faker.provider.function``.

Expand All @@ -205,18 +261,22 @@ def _reverse_transform(self, data):
sample_size = self.data_length

try:
reverse_transformed = np.array([
self._function()
for _ in range(sample_size)
], dtype=object)
if self.cardinality_rule == 'match':
reverse_transformed = self._reverse_transform_cardinality_rule_match(sample_size)
else:
reverse_transformed = np.array([
self._function()
for _ in range(sample_size)
], dtype=object)

except faker.exceptions.UniquenessException as exception:
raise TransformerProcessingError(
f'The Faker function you specified is not able to generate {sample_size} unique '
'values. Please use a different Faker function for column '
f"('{self.get_input_column()}')."
) from exception

if self.missing_value_generation == 'random':
if self.missing_value_generation == 'random' and not pd.isna(reverse_transformed).any():
num_nans = int(self._nan_frequency * sample_size)
nan_indices = np.random.choice(sample_size, num_nans, replace=False)
reverse_transformed[nan_indices] = np.nan
Expand All @@ -235,6 +295,7 @@ def __repr__(self):
args = inspect.getfullargspec(self.__init__)
keys = args.args[1:]
defaults = dict(zip(keys, args.defaults))
keys.remove('enforce_uniqueness')
instanced = {key: getattr(self, key) for key in keys}

defaults['function_name'] = None
Expand Down Expand Up @@ -283,7 +344,7 @@ def __init__(self, provider_name=None, function_name=None, function_kwargs=None,
function_name=function_name,
function_kwargs=function_kwargs,
locales=locales,
enforce_uniqueness=True
cardinality_rule='unique'
)
self._mapping_dict = {}
self._reverse_mapping_dict = {}
Expand Down
95 changes: 93 additions & 2 deletions tests/integration/transformers/pii/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_custom_provider_with_nans(self):
assert len(reverse_transform['cc']) == 5
assert reverse_transform['cc'].isna().sum() == 1

def test_enforce_uniqueness(self):
def test_cardinality_rule(self):
"""Test that ``AnonymizedFaker`` works with uniqueness.

Also ensure that when we call ``reset_randomization`` the generator will be able to
Expand All @@ -139,7 +139,7 @@ def test_enforce_uniqueness(self):
'job': np.arange(500)
})

instance = AnonymizedFaker('job', 'job', enforce_uniqueness=True)
instance = AnonymizedFaker('job', 'job', cardinality_rule='unique')
transformed = instance.fit_transform(data, 'job')
reverse_transform = instance.reverse_transform(transformed)

Expand All @@ -157,6 +157,97 @@ def test_enforce_uniqueness(self):
instance.reset_randomization()
instance.reverse_transform(transformed)

def test_cardinality_rule_match(self):
    """Check that rule 'match' yields as many distinct values as were fitted."""
    # Setup
    frame = pd.DataFrame({'col': [1, 2, 3, 1, 2]})
    transformer = AnonymizedFaker(cardinality_rule='match')

    # Run
    dropped = transformer.fit_transform(frame, 'col')
    restored = transformer.reverse_transform(dropped)

    # Assert
    distinct_values = restored['col'].unique()
    assert len(distinct_values) == 3

def test_cardinality_rule_match_nans(self):
    """Check that rule 'match' keeps both the cardinality and the missing values."""
    # Setup
    frame = pd.DataFrame({'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2]})
    transformer = AnonymizedFaker(cardinality_rule='match')

    # Run
    dropped = transformer.fit_transform(frame, 'col')
    restored = transformer.reverse_transform(dropped)

    # Assert
    column = restored['col']
    # Three generated categories plus the missing value.
    assert len(column.unique()) == 4
    assert column.isna().sum() == 3

def test_cardinality_rule_match_not_enough_unique_values(self):
    """Test it works with the cardinality rule 'match' and too few values to transform.

    The transformer is fitted on nine rows (three distinct values, three missing) and
    then asked to reverse transform only three rows.
    """
    # Setup
    data_fit = pd.DataFrame({
        'col': [1, 2, 3, 1, 2, None, np.nan, np.nan, 2]
    })
    data_transform = pd.DataFrame({
        'col': [1, 1, 1]
    })
    instance = AnonymizedFaker(cardinality_rule='match')

    # Run
    # ``fit`` is called for its side effect only; its return value is not needed.
    instance.fit(data_fit, 'col')
    transformed = instance.transform(data_transform)
    reverse_transform = instance.reverse_transform(transformed)

    # Assert
    # Two generated categories plus the missing value.
    assert len(reverse_transform['col'].unique()) == 3
    assert reverse_transform['col'].isna().sum() == 1

def test_cardinality_rule_match_too_many_unique(self):
    """Test it works with the cardinality rule 'match' and more unique values than samples.

    The transformer is fitted on six distinct values without missing data and then
    asked to reverse transform five rows.
    """
    # Setup
    data_fit = pd.DataFrame({
        'col': [1, 2, 3, 4, 5, 6]
    })
    data_transform = pd.DataFrame({
        'col': [1, 1, np.nan, 3, 1]
    })
    instance = AnonymizedFaker(cardinality_rule='match')

    # Run
    # ``fit`` is called for its side effect only; its return value is not needed.
    instance.fit(data_fit, 'col')
    transformed = instance.transform(data_transform)
    reverse_transform = instance.reverse_transform(transformed)

    # Assert
    assert len(reverse_transform['col'].unique()) == 5
    assert reverse_transform['col'].isna().sum() == 0

def test_cardinality_rule_match_too_many_nans(self):
    """Test it works with the cardinality rule 'match' and more nans than possible to fit.

    The transformer is fitted on six rows, half of them missing, and then asked to
    reverse transform four rows.
    """
    # Setup
    data_fit = pd.DataFrame({
        'col': [1, 2, 3, np.nan, np.nan, np.nan]
    })
    data_transform = pd.DataFrame({
        'col': [1, 1, 1, 1]
    })
    instance = AnonymizedFaker(cardinality_rule='match')

    # Run
    # ``fit`` is called for its side effect only; its return value is not needed.
    instance.fit(data_fit, 'col')
    transformed = instance.transform(data_transform)
    reverse_transform = instance.reverse_transform(transformed)

    # Assert
    # Two generated categories plus the missing value.
    assert len(reverse_transform['col'].unique()) == 3
    assert reverse_transform['col'].isna().sum() == 2


class TestPsuedoAnonymizedFaker:
def test_default_settings(self):
Expand Down
Loading
Loading