From 98c50d09d6780beb92acd70cdc614166db66a0cf Mon Sep 17 00:00:00 2001 From: John La Date: Wed, 25 Oct 2023 11:53:52 -0500 Subject: [PATCH] Issue sdv 1439 move anonymization (#729) --- rdt/transformers/pii/anonymization.py | 108 ++++++++++++++ setup.py | 2 +- .../transformers/pii/test_anonymization.py | 25 ++++ .../transformers/pii/test_anonymization.py | 141 ++++++++++++++++++ 4 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 rdt/transformers/pii/anonymization.py create mode 100644 tests/integration/transformers/pii/test_anonymization.py create mode 100644 tests/unit/transformers/pii/test_anonymization.py diff --git a/rdt/transformers/pii/anonymization.py b/rdt/transformers/pii/anonymization.py new file mode 100644 index 000000000..eb4443d37 --- /dev/null +++ b/rdt/transformers/pii/anonymization.py @@ -0,0 +1,108 @@ +"""Anonymization module for the RDT PII Transformer.""" + +import inspect +import warnings +from functools import lru_cache + +from faker import Faker +from faker.config import AVAILABLE_LOCALES + +from rdt.transformers import AnonymizedFaker + +SDTYPE_ANONYMIZERS = { + 'address': { + 'provider_name': 'address', + 'function_name': 'address' + }, + 'email': { + 'provider_name': 'internet', + 'function_name': 'email' + }, + 'ipv4_address': { + 'provider_name': 'internet', + 'function_name': 'ipv4' + }, + 'ipv6_address': { + 'provider_name': 'internet', + 'function_name': 'ipv6' + }, + 'mac_address': { + 'provider_name': 'internet', + 'function_name': 'mac_address' + }, + 'name': { + 'provider_name': 'person', + 'function_name': 'name' + }, + 'phone_number': { + 'provider_name': 'phone_number', + 'function_name': 'phone_number' + }, + 'ssn': { + 'provider_name': 'ssn', + 'function_name': 'ssn' + }, + 'user_agent_string': { + 'provider_name': 'user_agent', + 'function_name': 'user_agent' + }, +} + + +@lru_cache() +def get_faker_instance(): + """Return a ``faker.Faker`` instance with all the locales.""" + return Faker(AVAILABLE_LOCALES) 
+ + +def is_faker_function(function_name): + """Return whether or not the function name is a valid Faker function. + + Args: + function_name (str): + String representing predefined ``sdtype`` or a ``faker`` function. + + Returns: + True if the ``function_name`` is known to ``Faker``, otherwise False. + """ + try: + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', module='faker') + getattr(get_faker_instance(), function_name) + except AttributeError: + return False + + return True + + +def _detect_provider_name(function_name, locales=None): + function_name = getattr(Faker(locale=locales), function_name) + module = inspect.getmodule(function_name).__name__ + module = module.split('.') + if len(module) == 2: + return 'BaseProvider' + return '.'.join(module[2:]) + + +def get_anonymized_transformer(function_name, transformer_kwargs=None): + """Get an instance with an ``AnonymizedFaker`` for the given ``function_name``. + + Args: + function_name (str): + String representing predefined ``sdtype`` or a ``faker`` function. + transformer_kwargs (dict): + Keyword args to pass into AnonymizedFaker transformer. Optional. 
+ """ + transformer_kwargs = transformer_kwargs or {} + locales = transformer_kwargs.get('locales') + if function_name in SDTYPE_ANONYMIZERS: + transformer_kwargs.update(SDTYPE_ANONYMIZERS[function_name]) + return AnonymizedFaker(**transformer_kwargs) + + provider_name = _detect_provider_name(function_name, locales=locales) + transformer_kwargs.update({ + 'function_name': function_name, + 'provider_name': provider_name + }) + + return AnonymizedFaker(**transformer_kwargs) diff --git a/setup.py b/setup.py index d94dc7e3f..c7db55c30 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ "scipy>=1.9.2,<2;python_version>='3.10'", "scikit-learn>=0.24,<2;python_version<'3.10'", "scikit-learn>=1.1.3,<2;python_version>='3.10'", - 'Faker>=10', + 'Faker>=17,<20', ] copulas_requires = [ diff --git a/tests/integration/transformers/pii/test_anonymization.py b/tests/integration/transformers/pii/test_anonymization.py new file mode 100644 index 000000000..dc53a42ed --- /dev/null +++ b/tests/integration/transformers/pii/test_anonymization.py @@ -0,0 +1,25 @@ +from faker import Faker + +from rdt.transformers.pii.anonymization import is_faker_function + + +def test_is_faker_function(): + """Test is_faker_function checks if function is a valid Faker function.""" + # Run + result = is_faker_function('address') + + # Assert + assert result is True + + +def test_is_faker_function_non_default_locale(): + """Test is_faker_function checks non-default locales.""" + # Setup + function_name = 'postcode_in_province' + + # Run + result = is_faker_function(function_name) + + # Assert + assert result is True + assert not hasattr(Faker(), function_name) diff --git a/tests/unit/transformers/pii/test_anonymization.py b/tests/unit/transformers/pii/test_anonymization.py new file mode 100644 index 000000000..1cfa79ee6 --- /dev/null +++ b/tests/unit/transformers/pii/test_anonymization.py @@ -0,0 +1,141 @@ +from unittest.mock import Mock, patch + +from rdt.transformers.pii.anonymization import ( + 
+ _detect_provider_name, get_anonymized_transformer, get_faker_instance, is_faker_function) + + +class TestAnonimization: + + def test__detect_provider_name(self): + """Test the ``_detect_provider_name`` method. + + Test that the function returns an expected provider name from the ``faker.Faker`` instance. + If this is from the ``BaseProvider`` it should also return that name. + + Input: + - Faker function name. + + Output: + - The faker provider name for that function. + """ + # Run / Assert + email_provider = _detect_provider_name('email') + lexify_provider = _detect_provider_name('lexify') + state_provider = _detect_provider_name('state') + + assert email_provider == 'internet' + assert lexify_provider == 'BaseProvider' + assert state_provider == 'address.en_US' + + @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') + def test_get_anonymized_transformer_with_existing_sdtype(self, mock_anonymized_faker): + """Test the ``get_anonymized_transformer`` method. + + Test that when calling with an existing ``sdtype`` / ``function_name`` from the + ``SDTYPE_ANONYMIZERS`` dictionary, their ``provider_name`` and ``function_name`` are being + used by default, and also other ``kwargs`` and provided locales are being passed to the + ``AnonymizedFaker``. + + Input: + - ``function_name`` from the ``SDTYPE_ANONYMIZERS``. + - ``function_kwargs`` additional keyword arguments for that set of arguments. + + Mock: + - Mock ``AnonymizedFaker`` and assert that it has been called with the expected + arguments. + + Output: + - The return value must be the instance of ``AnonymizedFaker``. 
+ """ + # Setup + output = get_anonymized_transformer('email', transformer_kwargs={ + 'function_kwargs': {'domain': '@gmail.com'}, 'locales': ['en_CA', 'fr_CA'] + }) + + # Assert + assert output == mock_anonymized_faker.return_value + mock_anonymized_faker.assert_called_once_with( + provider_name='internet', + function_name='email', + function_kwargs={'domain': '@gmail.com'}, + locales=['en_CA', 'fr_CA'] + ) + + @patch('rdt.transformers.pii.anonymization.AnonymizedFaker') + def test_get_anonymized_transformer_with_custom_sdtype(self, mock_anonymized_faker): + """Test the ``get_anonymized_transformer`` method. + + Test that when calling with a custom ``sdtype`` / ``function_name`` that does not belong + to the ``SDTYPE_ANONYMIZERS`` dictionary, the ``provider_name`` is being found + automatically and other ``kwargs`` and provided locales are being passed to the + ``AnonymizedFaker``. + + Input: + - ``function_name`` color. + - ``function_kwargs`` a dictionary with ``'hue': 'red'``. + + Mock: + - Mock ``AnonymizedFaker`` and assert that it has been called with the expected + arguments. + + Output: + - The return value must be the instance of ``AnonymizedFaker``. + """ + # Setup + output = get_anonymized_transformer('color', transformer_kwargs={ + 'function_kwargs': {'hue': 'red'}, 'locales': ['en_CA', 'fr_CA'] + }) + + # Assert + assert output == mock_anonymized_faker.return_value + mock_anonymized_faker.assert_called_once_with( + provider_name='color', + function_name='color', + function_kwargs={'hue': 'red'}, + locales=['en_CA', 'fr_CA'] + ) + + @patch('rdt.transformers.pii.anonymization.Faker') + def test_is_faker_function(self, faker_mock): + """Test that the method returns True if the ``function_name`` is a valid faker function. + + This test mocks the ``Faker`` method to make sure that the ``function_name`` is an + attribute it has. 
+ """ + # Setup + faker_mock.return_value = Mock(spec=['address']) + + # Run + result = is_faker_function('address') + + # Assert + assert result is True + + @patch('rdt.transformers.pii.anonymization.get_faker_instance') + def test_is_faker_function_error(self, mock_get_faker_instance): + """Test that the method returns False if ``function_name`` is not a valid faker function. + + If the ``function_name`` is not an attribute of ``Faker()`` then we should return false. + This test mocks ``Faker`` to not have the attribute that is passed as ``function_name``. + """ + # Setup + mock_get_faker_instance.return_value = Mock(spec=[]) + + # Run + result = is_faker_function('blah') + + # Assert + assert result is False + mock_get_faker_instance.assert_called_once() + + @patch('rdt.transformers.pii.anonymization.Faker') + def test_get_faker_instance(self, mock_faker): + """Test that ``get_faker_instance`` returns the same object.""" + # Setup + first_instance = get_faker_instance() + + # Run + second_instance = get_faker_instance() + + # Assert + assert id(first_instance) == id(second_instance)