Skip to content

Commit

Permalink
Issue sdv 1439 move anonymization (#729)
Browse files Browse the repository at this point in the history
  • Loading branch information
lajohn4747 authored Oct 25, 2023
1 parent 2ee0077 commit 98c50d0
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 1 deletion.
108 changes: 108 additions & 0 deletions rdt/transformers/pii/anonymization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Anonymization module for the RDT PII Transformer."""

import inspect
import warnings
from functools import lru_cache

from faker import Faker
from faker.config import AVAILABLE_LOCALES

from rdt.transformers import AnonymizedFaker

# Maps each predefined ``sdtype`` to the ``provider_name`` / ``function_name``
# pair handed to ``AnonymizedFaker`` for that sdtype.
# Each row is (sdtype, provider_name, function_name).
SDTYPE_ANONYMIZERS = {
    sdtype: {'provider_name': provider, 'function_name': function}
    for sdtype, provider, function in (
        ('address', 'address', 'address'),
        ('email', 'internet', 'email'),
        ('ipv4_address', 'internet', 'ipv4'),
        ('ipv6_address', 'internet', 'ipv6'),
        ('mac_address', 'internet', 'mac_address'),
        ('name', 'person', 'name'),
        ('phone_number', 'phone_number', 'phone_number'),
        ('ssn', 'ssn', 'ssn'),
        ('user_agent_string', 'user_agent', 'user_agent'),
    )
}


@lru_cache()
def get_faker_instance():
    """Return a ``faker.Faker`` instance built with every available locale.

    The result is memoized with ``lru_cache``, so repeated calls return the
    same object instead of constructing a new ``Faker`` each time.

    Returns:
        faker.Faker:
            Instance created from ``faker.config.AVAILABLE_LOCALES``.
    """
    all_locales_faker = Faker(AVAILABLE_LOCALES)
    return all_locales_faker


def is_faker_function(function_name):
    """Return whether or not the function name is a valid Faker function.

    The lookup is done on the shared instance returned by ``get_faker_instance``.
    Warnings attributed to the ``faker`` module are silenced during the lookup.

    Args:
        function_name (str):
            String representing predefined ``sdtype`` or a ``faker`` function.

    Returns:
        bool:
            True if the ``function_name`` is known to ``Faker``, otherwise False.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', module='faker')
        try:
            # Only the attribute lookup matters; the resolved value is discarded.
            getattr(get_faker_instance(), function_name)
        except AttributeError:
            is_known = False
        else:
            is_known = True

    return is_known


def _detect_provider_name(function_name, locales=None):
    """Find the name of the Faker provider that implements ``function_name``.

    The name is derived from the dotted path of the module that defines the
    function: the first two components are dropped and the rest are joined
    with ``'.'``. When the path has exactly two components, ``'BaseProvider'``
    is returned instead.

    Args:
        function_name (str):
            Name of the ``faker`` function to look up.
        locales (str, list or None):
            Passed to ``Faker`` as ``locale``. Defaults to ``None``.

    Returns:
        str:
            The provider name, or ``'BaseProvider'``.
    """
    faker_method = getattr(Faker(locale=locales), function_name)
    module_parts = inspect.getmodule(faker_method).__name__.split('.')
    if len(module_parts) != 2:
        return '.'.join(module_parts[2:])

    return 'BaseProvider'


def get_anonymized_transformer(function_name, transformer_kwargs=None):
    """Get an instance with an ``AnonymizedFaker`` for the given ``function_name``.

    If ``function_name`` is a key of ``SDTYPE_ANONYMIZERS``, the ``provider_name`` and
    ``function_name`` stored there are used. Otherwise ``function_name`` is used as is
    and the provider is detected with ``_detect_provider_name``, using the ``locales``
    entry of ``transformer_kwargs`` (if any).

    Args:
        function_name (str):
            String representing predefined ``sdtype`` or a ``faker`` function.
        transformer_kwargs (dict):
            Keyword args to pass into AnonymizedFaker transformer. Optional.
            The dictionary is not modified.

    Returns:
        AnonymizedFaker:
            Transformer built from ``transformer_kwargs`` plus the resolved
            ``provider_name`` and ``function_name``.
    """
    # Work on a shallow copy so the resolved names below never leak back into the
    # caller's dictionary, which may be reused for other columns.
    transformer_kwargs = dict(transformer_kwargs or {})

    if function_name in SDTYPE_ANONYMIZERS:
        transformer_kwargs.update(SDTYPE_ANONYMIZERS[function_name])
    else:
        locales = transformer_kwargs.get('locales')
        transformer_kwargs.update({
            'function_name': function_name,
            'provider_name': _detect_provider_name(function_name, locales=locales),
        })

    return AnonymizedFaker(**transformer_kwargs)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"scipy>=1.9.2,<2;python_version>='3.10'",
"scikit-learn>=0.24,<2;python_version<'3.10'",
"scikit-learn>=1.1.3,<2;python_version>='3.10'",
'Faker>=10',
'Faker>=17,<20',
]

copulas_requires = [
Expand Down
25 changes: 25 additions & 0 deletions tests/integration/transformers/pii/test_anonymization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from faker import Faker

from rdt.transformers.pii.anonymization import is_faker_function


def test_is_faker_function():
    """Test that ``is_faker_function`` returns ``True`` for the ``address`` function."""
    # Run
    is_known = is_faker_function('address')

    # Assert
    assert is_known is True


def test_is_faker_function_non_default_locale():
    """Test that ``is_faker_function`` accepts a function missing from a default ``Faker``.

    ``postcode_in_province`` is not an attribute of ``Faker()``, yet
    ``is_faker_function`` is expected to report it as valid.
    """
    # Setup
    locale_specific_function = 'postcode_in_province'

    # Run
    is_known = is_faker_function(locale_specific_function)

    # Assert
    assert is_known is True
    default_faker_has_it = hasattr(Faker(), locale_specific_function)
    assert not default_faker_has_it
141 changes: 141 additions & 0 deletions tests/unit/transformers/pii/test_anonymization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
from unittest.mock import Mock, patch

from rdt.transformers.pii.anonymization import (
_detect_provider_name, get_anonymized_transformer, get_faker_instance, is_faker_function)


class TestAnonimization:
    """Unit tests for the ``rdt.transformers.pii.anonymization`` module."""

    def test__detect_provider_name(self):
        """Test the ``_detect_provider_name`` method.

        Test that the function returns an expected provider name from the ``faker.Faker``
        instance. If this is from the ``BaseProvider`` it should also return that name.

        Input:
            - Faker function name.

        Output:
            - The faker provider name for that function.
        """
        # Run / Assert
        email_provider = _detect_provider_name('email')
        lexify_provider = _detect_provider_name('lexify')
        state_provider = _detect_provider_name('state')

        assert email_provider == 'internet'
        assert lexify_provider == 'BaseProvider'
        assert state_provider == 'address.en_US'

    @patch('rdt.transformers.pii.anonymization.AnonymizedFaker')
    def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_faker):
        """Test the ``get_anonymized_transformer`` method.

        Test that when calling with an existing ``sdtype`` / ``function_name`` from the
        ``SDTYPE_ANONYMIZERS`` dictionary, their ``provider_name`` and ``function_name`` are
        being used by default, and also other ``kwargs`` and provided locales are being passed
        to the ``AnonymizedFaker``.

        Input:
            - ``function_name`` from the ``SDTYPE_ANONYMIZERS``.
            - ``function_kwargs`` additional keyword arguments for that set of arguments.

        Mock:
            - Mock ``AnonymizedFaker`` and assert that has been called with the expected
              arguments.

        Output:
            - The return value must be the instance of ``AnonymizedFaker``.
        """
        # Run
        output = get_anonymized_transformer('email', transformer_kwargs={
            'function_kwargs': {'domain': '@gmail.com'}, 'locales': ['en_CA', 'fr_CA']
        })

        # Assert
        assert output == mock_anonymized_faker.return_value
        mock_anonymized_faker.assert_called_once_with(
            provider_name='internet',
            function_name='email',
            function_kwargs={'domain': '@gmail.com'},
            locales=['en_CA', 'fr_CA']
        )

    @patch('rdt.transformers.pii.anonymization.AnonymizedFaker')
    def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_faker):
        """Test the ``get_anonymized_transformer`` method.

        Test that when calling with a custom ``sdtype`` / ``function_name`` that does not
        belong to the ``SDTYPE_ANONYMIZERS`` dictionary, the ``provider_name`` is being found
        automatically and other ``kwargs`` and provided locales are being passed to the
        ``AnonymizedFaker``.

        Input:
            - ``function_name`` color.
            - ``function_kwargs`` a dictionary with ``'hue': 'red'``.

        Mock:
            - Mock ``AnonymizedFaker`` and assert that has been called with the expected
              arguments.

        Output:
            - The return value must be the instance of ``AnonymizedFaker``.
        """
        # Run
        output = get_anonymized_transformer('color', transformer_kwargs={
            'function_kwargs': {'hue': 'red'}, 'locales': ['en_CA', 'fr_CA']
        })

        # Assert
        assert output == mock_anonymized_faker.return_value
        mock_anonymized_faker.assert_called_once_with(
            provider_name='color',
            function_name='color',
            function_kwargs={'hue': 'red'},
            locales=['en_CA', 'fr_CA']
        )

    @patch('rdt.transformers.pii.anonymization.get_faker_instance')
    def test_is_faker_function(self, mock_get_faker_instance):
        """Test that the method returns True if the ``function_name`` is a valid faker function.

        ``get_faker_instance`` is mocked (rather than ``Faker``) because it is wrapped in
        ``lru_cache``: patching ``Faker`` has no effect once an instance is cached, and would
        leave a ``Mock`` in the cache when it is not.
        """
        # Setup
        mock_get_faker_instance.return_value = Mock(spec=['address'])

        # Run
        result = is_faker_function('address')

        # Assert
        assert result is True
        mock_get_faker_instance.assert_called_once()

    @patch('rdt.transformers.pii.anonymization.get_faker_instance')
    def test_is_faker_function_error(self, mock_get_faker_instance):
        """Test that the method returns False if ``function_name`` is not a valid faker function.

        If the ``function_name`` is not an attribute of the faker instance then we should
        return False. This test mocks ``get_faker_instance`` to return an object that does not
        have the attribute that is passed as ``function_name``.
        """
        # Setup
        mock_get_faker_instance.return_value = Mock(spec=[])

        # Run
        result = is_faker_function('blah')

        # Assert
        assert result is False
        mock_get_faker_instance.assert_called_once()

    @patch('rdt.transformers.pii.anonymization.Faker')
    def test_get_faker_instance(self, mock_faker):
        """Test that ``get_faker_instance`` returns the same object."""
        # Setup
        # Start from an empty cache so the patched ``Faker`` is the one being called.
        get_faker_instance.cache_clear()

        try:
            # Run
            first_instance = get_faker_instance()
            second_instance = get_faker_instance()

            # Assert
            assert first_instance is second_instance
            mock_faker.assert_called_once()
        finally:
            # Drop the cached mock so it cannot leak into other tests.
            get_faker_instance.cache_clear()

0 comments on commit 98c50d0

Please sign in to comment.