From 4b6dba42c6e25a84fbaff8e5ccc71ebd56f1b086 Mon Sep 17 00:00:00 2001 From: Felipe Date: Mon, 10 Feb 2025 08:23:34 -0800 Subject: [PATCH] Add integration tests --- sdv/metadata/multi_table.py | 8 - tests/integration/metadata/test_metadata.py | 286 ++++++++++++++++++++ tests/unit/metadata/test_single_table.py | 142 +++++----- 3 files changed, 349 insertions(+), 87 deletions(-) diff --git a/sdv/metadata/multi_table.py b/sdv/metadata/multi_table.py index 9815ebf9f..32af48ab6 100644 --- a/sdv/metadata/multi_table.py +++ b/sdv/metadata/multi_table.py @@ -530,14 +530,6 @@ def _detect_relationships(self, data=None): ) continue - @staticmethod - def _validate_infer_sdtypes_and_keys(infer_sdtypes, infer_keys): - if not isinstance(infer_sdtypes, bool): - raise ValueError("'infer_sdtypes' must be a boolean value.") - - if infer_keys not in ['primary_only', None]: - raise ValueError("'infer_keys' must be one of: 'primary_only', None.") - def detect_table_from_dataframe( self, table_name, data, infer_sdtypes=True, infer_keys='primary_only' ): diff --git a/tests/integration/metadata/test_metadata.py b/tests/integration/metadata/test_metadata.py index 09f2f54b7..8b7e62cd1 100644 --- a/tests/integration/metadata/test_metadata.py +++ b/tests/integration/metadata/test_metadata.py @@ -121,6 +121,149 @@ def test_detect_from_dataframes_multi_table(): assert metadata.to_dict() == expected_metadata +def test_detect_from_dataframes_multi_table_without_infer_sdtypes(): + """Test it when infer_sdtypes is False.""" + # Setup + real_data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + + # Run + metadata = Metadata.detect_from_dataframes(real_data, infer_sdtypes=False) + + # Assert + metadata.update_column( + table_name='hotels', + column_name='classification', + sdtype='categorical', + ) + + expected_metadata = { + 'tables': { + 'hotels': { + 'columns': { + 'hotel_id': {'sdtype': 'unknown', 'pii': True}, + 'city': {'sdtype': 'unknown', 'pii': True}, + 'state': 
{'sdtype': 'unknown', 'pii': True}, + 'rating': {'sdtype': 'unknown', 'pii': True}, + 'classification': {'sdtype': 'categorical'}, + }, + }, + 'guests': { + 'columns': { + 'guest_email': {'sdtype': 'unknown', 'pii': True}, + 'hotel_id': {'sdtype': 'unknown', 'pii': True}, + 'has_rewards': {'sdtype': 'unknown', 'pii': True}, + 'room_type': {'sdtype': 'unknown', 'pii': True}, + 'amenities_fee': {'sdtype': 'unknown', 'pii': True}, + 'checkin_date': {'sdtype': 'unknown', 'pii': True}, + 'checkout_date': {'sdtype': 'unknown', 'pii': True}, + 'room_rate': {'sdtype': 'unknown', 'pii': True}, + 'billing_address': {'sdtype': 'unknown', 'pii': True}, + 'credit_card_number': {'sdtype': 'unknown', 'pii': True}, + }, + }, + }, + 'relationships': [], + 'METADATA_SPEC_VERSION': 'V1', + } + assert metadata.to_dict() == expected_metadata + + +def test_detect_from_dataframes_multi_table_with_infer_keys_primary_only(): + """Test it when infer_keys is 'primary_only'.""" + # Setup + real_data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + + # Run + metadata = Metadata.detect_from_dataframes(real_data, infer_keys='primary_only') + + # Assert + metadata.update_column( + table_name='hotels', + column_name='classification', + sdtype='categorical', + ) + + expected_metadata = { + 'tables': { + 'hotels': { + 'columns': { + 'hotel_id': {'sdtype': 'id'}, + 'city': {'sdtype': 'city', 'pii': True}, + 'state': {'sdtype': 'administrative_unit', 'pii': True}, + 'rating': {'sdtype': 'numerical'}, + 'classification': {'sdtype': 'categorical'}, + }, + 'primary_key': 'hotel_id', + }, + 'guests': { + 'columns': { + 'guest_email': {'sdtype': 'email', 'pii': True}, + 'hotel_id': {'sdtype': 'categorical'}, + 'has_rewards': {'sdtype': 'categorical'}, + 'room_type': {'sdtype': 'categorical'}, + 'amenities_fee': {'sdtype': 'numerical'}, + 'checkin_date': {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'}, + 'checkout_date': {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'}, + 
'room_rate': {'sdtype': 'numerical'}, + 'billing_address': {'sdtype': 'unknown', 'pii': True}, + 'credit_card_number': {'sdtype': 'credit_card_number', 'pii': True}, + }, + 'primary_key': 'guest_email', + }, + }, + 'relationships': [], + 'METADATA_SPEC_VERSION': 'V1', + } + assert metadata.to_dict() == expected_metadata + + +def test_detect_from_dataframes_multi_table_with_infer_keys_none(): + """Test it when infer_keys is None.""" + # Setup + real_data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + + # Run + metadata = Metadata.detect_from_dataframes(real_data, infer_keys=None) + + # Assert + metadata.update_column( + table_name='hotels', + column_name='classification', + sdtype='categorical', + ) + + expected_metadata = { + 'tables': { + 'hotels': { + 'columns': { + 'hotel_id': {'sdtype': 'id'}, + 'city': {'sdtype': 'city', 'pii': True}, + 'state': {'sdtype': 'administrative_unit', 'pii': True}, + 'rating': {'sdtype': 'numerical'}, + 'classification': {'sdtype': 'categorical'}, + }, + }, + 'guests': { + 'columns': { + 'guest_email': {'sdtype': 'email', 'pii': True}, + 'hotel_id': {'sdtype': 'categorical'}, + 'has_rewards': {'sdtype': 'categorical'}, + 'room_type': {'sdtype': 'categorical'}, + 'amenities_fee': {'sdtype': 'numerical'}, + 'checkin_date': {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'}, + 'checkout_date': {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'}, + 'room_rate': {'sdtype': 'numerical'}, + 'billing_address': {'sdtype': 'unknown', 'pii': True}, + 'credit_card_number': {'sdtype': 'credit_card_number', 'pii': True}, + }, + }, + }, + 'relationships': [], + 'METADATA_SPEC_VERSION': 'V1', + } + assert metadata.to_dict() == expected_metadata + + def test_detect_from_dataframes_single_table(): """Test the ``detect_from_dataframes`` method works with a single table.""" # Setup @@ -151,6 +294,93 @@ def test_detect_from_dataframes_single_table(): assert metadata.to_dict() == expected_metadata +def 
test_detect_from_dataframes_single_table_infer_sdtypes_false(): + """Test it for a single table when infer_sdtypes is False.""" + # Setup + data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + metadata = Metadata.detect_from_dataframes({'table_1': data['hotels']}, infer_sdtypes=False) + + # Run + metadata.validate() + + # Assert + expected_metadata = { + 'METADATA_SPEC_VERSION': 'V1', + 'tables': { + 'table_1': { + 'columns': { + 'hotel_id': {'sdtype': 'unknown', 'pii': True}, + 'city': {'sdtype': 'unknown', 'pii': True}, + 'state': {'sdtype': 'unknown', 'pii': True}, + 'rating': {'sdtype': 'unknown', 'pii': True}, + 'classification': {'sdtype': 'unknown', 'pii': True}, + }, + } + }, + 'relationships': [], + } + assert metadata.to_dict() == expected_metadata + + +def test_detect_from_dataframes_single_table_infer_keys_primary_only(): + """Test it for a single table when infer_keys is 'primary_only'.""" + # Setup + data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + metadata = Metadata.detect_from_dataframes( + {'table_1': data['hotels']}, infer_keys='primary_only' + ) + + # Run + metadata.validate() + + # Assert + expected_metadata = { + 'METADATA_SPEC_VERSION': 'V1', + 'tables': { + 'table_1': { + 'columns': { + 'hotel_id': {'sdtype': 'id'}, + 'city': {'sdtype': 'city', 'pii': True}, + 'state': {'sdtype': 'administrative_unit', 'pii': True}, + 'rating': {'sdtype': 'numerical'}, + 'classification': {'sdtype': 'unknown', 'pii': True}, + }, + 'primary_key': 'hotel_id', + } + }, + 'relationships': [], + } + assert metadata.to_dict() == expected_metadata + + +def test_detect_from_dataframes_single_table_infer_keys_none(): + """Test it for a single table when infer_keys is None.""" + # Setup + data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + metadata = Metadata.detect_from_dataframes({'table_1': data['hotels']}, infer_keys=None) + + # Run + metadata.validate() + + # Assert + 
expected_metadata = { + 'METADATA_SPEC_VERSION': 'V1', + 'tables': { + 'table_1': { + 'columns': { + 'hotel_id': {'sdtype': 'id'}, + 'city': {'sdtype': 'city', 'pii': True}, + 'state': {'sdtype': 'administrative_unit', 'pii': True}, + 'rating': {'sdtype': 'numerical'}, + 'classification': {'sdtype': 'unknown', 'pii': True}, + }, + } + }, + 'relationships': [], + } + assert metadata.to_dict() == expected_metadata + + def test_detect_from_dataframe(): """Test that a single table can be detected as a DataFrame.""" # Setup @@ -181,6 +411,62 @@ def test_detect_from_dataframe(): assert metadata.to_dict() == expected_metadata +def test_detect_from_dataframe_infer_sdtypes_false(): + """Test it when infer_sdtypes is False.""" + # Setup + data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + metadata = Metadata.detect_from_dataframe(data['hotels'], infer_sdtypes=False) + + # Run + metadata.validate() + + # Assert + expected_metadata = { + 'METADATA_SPEC_VERSION': 'V1', + 'tables': { + DEFAULT_TABLE_NAME: { + 'columns': { + 'hotel_id': {'sdtype': 'unknown', 'pii': True}, + 'city': {'sdtype': 'unknown', 'pii': True}, + 'state': {'sdtype': 'unknown', 'pii': True}, + 'rating': {'sdtype': 'unknown', 'pii': True}, + 'classification': {'sdtype': 'unknown', 'pii': True}, + }, + } + }, + 'relationships': [], + } + assert metadata.to_dict() == expected_metadata + + +def test_detect_from_dataframe_infer_keys_none(): + """Test it when infer_keys is None.""" + # Setup + data, _ = download_demo(modality='multi_table', dataset_name='fake_hotels') + metadata = Metadata.detect_from_dataframe(data['hotels'], infer_keys=None) + + # Run + metadata.validate() + + # Assert + expected_metadata = { + 'METADATA_SPEC_VERSION': 'V1', + 'tables': { + DEFAULT_TABLE_NAME: { + 'columns': { + 'hotel_id': {'sdtype': 'id'}, + 'city': {'sdtype': 'city', 'pii': True}, + 'state': {'sdtype': 'administrative_unit', 'pii': True}, + 'rating': {'sdtype': 'numerical'}, + 'classification': 
{'sdtype': 'unknown', 'pii': True}, + }, + } + }, + 'relationships': [], + } + assert metadata.to_dict() == expected_metadata + + def test_detect_from_csvs(tmp_path): """Test the ``detect_from_csvs`` method.""" # Setup diff --git a/tests/unit/metadata/test_single_table.py b/tests/unit/metadata/test_single_table.py index 9f0b71a8e..ca29a1434 100644 --- a/tests/unit/metadata/test_single_table.py +++ b/tests/unit/metadata/test_single_table.py @@ -76,6 +76,44 @@ class TestSingleTableMetadata: ), ] # noqa: JS102 + @pytest.fixture + def data(self): + return pd.DataFrame({ + 'id': ['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11'], + 'numerical': [1, 2, 3, 2, 5, 6, 7, 8, 9, 10, 11], + 'datetime': [ + '2022-01-01', + '2022-02-01', + '2022-03-01', + '2022-04-01', + '2022-05-01', + '2022-06-01', + '2022-07-01', + '2022-08-01', + '2022-09-01', + '2022-10-01', + '2022-11-01', + ], + 'alternate_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + 'alternate_id_string': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], + 'categorical': ['a', 'b', 'a', 'a', 'b', 'b', 'a', 'b', 'a', 'b', 'a'], + 'bool': [True, False, True, False, True, False, True, False, True, False, True], + 'unknown': ['a', 'b', 'c', 'c', 1, 2.2, np.nan, None, 'd', 'e', 'f'], + 'first_name': [ + 'John', + 'Jane', + 'John', + 'Jane', + 'John', + 'Jane', + 'John', + 'Jane', + 'John', + 'Jane', + 'John', + ], + }) + def test___init__(self): """Test creating an instance of ``SingleTableMetadata``.""" # Run @@ -1111,46 +1149,10 @@ def test__determine_sdtype_for_objects_with_none(self): # Assert assert sdtype == 'categorical' - def test__detect_columns(self): + def test__detect_columns(self, data): """Test the ``_detect_columns`` method.""" # Setup instance = SingleTableMetadata() - data = pd.DataFrame({ - 'id': ['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11'], - 'numerical': [1, 2, 3, 2, 5, 6, 7, 8, 9, 10, 11], - 'datetime': [ - '2022-01-01', - '2022-02-01', 
- '2022-03-01', - '2022-04-01', - '2022-05-01', - '2022-06-01', - '2022-07-01', - '2022-08-01', - '2022-09-01', - '2022-10-01', - '2022-11-01', - ], - 'alternate_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - 'alternate_id_string': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], - 'categorical': ['a', 'b', 'a', 'a', 'b', 'b', 'a', 'b', 'a', 'b', 'a'], - 'bool': [True, False, True, False, True, False, True, False, True, False, True], - 'unknown': ['a', 'b', 'c', 'c', 1, 2.2, np.nan, None, 'd', 'e', 'f'], - 'first_name': [ - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - ], - }) - expected_datetime_format = '%Y-%m-%d' # Run @@ -1320,62 +1322,44 @@ def test__detect_columns_invalid_data_format(self): with pytest.raises(InvalidMetadataError, match=expected_error_message): instance._detect_columns(data) - def test__detect_columns_without_infer_sdtypes(self): + def test__detect_columns_without_infer_sdtypes(self, data): """Test the _detect_columns when infer_sdtypes is False.""" # Setup instance = SingleTableMetadata() - data = pd.DataFrame({ - 'id': ['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11'], - 'numerical': [1, 2, 3, 2, 5, 6, 7, 8, 9, 10, 11], - 'datetime': [ - '2022-01-01', - '2022-02-01', - '2022-03-01', - '2022-04-01', - '2022-05-01', - '2022-06-01', - '2022-07-01', - '2022-08-01', - '2022-09-01', - '2022-10-01', - '2022-11-01', - ], - 'alternate_id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - 'alternate_id_string': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], - 'categorical': ['a', 'b', 'a', 'a', 'b', 'b', 'a', 'b', 'a', 'b', 'a'], - 'bool': [True, False, True, False, True, False, True, False, True, False, True], - 'unknown': ['a', 'b', 'c', 'c', 1, 2.2, np.nan, None, 'd', 'e', 'f'], - 'first_name': [ - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - 'Jane', - 'John', - ], - }) # Run instance._detect_columns(data, 
infer_sdtypes=False) # Assert - assert instance.columns['id']['sdtype'] == 'unknown' - assert instance.columns['numerical']['sdtype'] == 'unknown' - assert instance.columns['datetime']['sdtype'] == 'unknown' + for column in data.columns: + assert instance.columns[column]['sdtype'] == 'unknown' + assert instance.columns[column]['pii'] is True + + assert instance.primary_key is None + assert instance._updated is True + + def test__detect_columns_without_infer_keys(self, data): + """Test the _detect_columns when infer_keys is None.""" + # Setup + instance = SingleTableMetadata() + + # Run + instance._detect_columns(data, infer_keys=None) + + # Assert + assert instance.columns['id']['sdtype'] == 'id' + assert instance.columns['numerical']['sdtype'] == 'numerical' + assert instance.columns['datetime']['sdtype'] == 'datetime' + assert instance.columns['datetime']['datetime_format'] == '%Y-%m-%d' assert instance.columns['alternate_id']['sdtype'] == 'unknown' assert instance.columns['alternate_id']['pii'] is True assert instance.columns['alternate_id_string']['sdtype'] == 'unknown' assert instance.columns['alternate_id_string']['pii'] is True - assert instance.columns['categorical']['sdtype'] == 'unknown' + assert instance.columns['categorical']['sdtype'] == 'categorical' assert instance.columns['unknown']['sdtype'] == 'unknown' assert instance.columns['unknown']['pii'] is True - assert instance.columns['bool']['sdtype'] == 'unknown' - assert instance.columns['first_name']['sdtype'] == 'unknown' + assert instance.columns['bool']['sdtype'] == 'categorical' + assert instance.columns['first_name']['sdtype'] == 'first_name' assert instance.columns['first_name']['pii'] is True assert instance.primary_key is None