Skip to content

Commit

Permalink
Add integration tests
Browse files Browse the repository at this point in the history
  • Loading branch information
fealho committed Feb 10, 2025
1 parent d600e38 commit 4b6dba4
Show file tree
Hide file tree
Showing 3 changed files with 349 additions and 87 deletions.
8 changes: 0 additions & 8 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,14 +530,6 @@ def _detect_relationships(self, data=None):
)
continue

@staticmethod
def _validate_infer_sdtypes_and_keys(infer_sdtypes, infer_keys):
    """Validate the ``infer_sdtypes`` and ``infer_keys`` detection options.

    Args:
        infer_sdtypes (bool):
            Must be an actual ``bool`` instance.
        infer_keys (str or None):
            Must be the string ``'primary_only'`` or ``None``.

    Raises:
        ValueError:
            If ``infer_sdtypes`` is not a boolean, or if ``infer_keys`` is not one
            of the accepted values. ``infer_sdtypes`` is checked first.
    """
    if not isinstance(infer_sdtypes, bool):
        raise ValueError("'infer_sdtypes' must be a boolean value.")

    # A plain ``in [...]`` membership test would call the argument's own ``__eq__``:
    # array-like values then fail with an unrelated "truth value is ambiguous" error
    # and objects with a permissive ``__eq__`` would slip through. Checking identity
    # and type first keeps the outcome independent of the argument's ``__eq__``.
    keys_are_valid = infer_keys is None or (
        isinstance(infer_keys, str) and infer_keys == 'primary_only'
    )
    if not keys_are_valid:
        raise ValueError("'infer_keys' must be one of: 'primary_only', None.")

def detect_table_from_dataframe(
self, table_name, data, infer_sdtypes=True, infer_keys='primary_only'
):
Expand Down
286 changes: 286 additions & 0 deletions tests/integration/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,149 @@ def test_detect_from_dataframes_multi_table():
assert metadata.to_dict() == expected_metadata


def test_detect_from_dataframes_multi_table_without_infer_sdtypes():
    """Detect the ``fake_hotels`` tables with ``infer_sdtypes=False``.

    Every column is expected to come back as ``unknown`` with ``pii`` set, with no
    primary keys and no relationships. The ``classification`` column of ``hotels`` is
    switched to ``categorical`` through ``update_column`` before the comparison.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    hotel_unknowns = ('hotel_id', 'city', 'state', 'rating')
    guest_unknowns = (
        'guest_email',
        'hotel_id',
        'has_rewards',
        'room_type',
        'amenities_fee',
        'checkin_date',
        'checkout_date',
        'room_rate',
        'billing_address',
        'credit_card_number',
    )
    hotels_columns = {name: {'sdtype': 'unknown', 'pii': True} for name in hotel_unknowns}
    hotels_columns['classification'] = {'sdtype': 'categorical'}
    guests_columns = {name: {'sdtype': 'unknown', 'pii': True} for name in guest_unknowns}

    # Run
    detected = Metadata.detect_from_dataframes(tables, infer_sdtypes=False)
    detected.update_column(
        table_name='hotels',
        column_name='classification',
        sdtype='categorical',
    )

    # Assert
    expected = {
        'tables': {
            'hotels': {'columns': hotels_columns},
            'guests': {'columns': guests_columns},
        },
        'relationships': [],
        'METADATA_SPEC_VERSION': 'V1',
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframes_multi_table_with_infer_keys_primary_only():
    """Detect the ``fake_hotels`` tables with ``infer_keys='primary_only'``.

    The expected metadata carries a primary key on each table (``hotel_id`` and
    ``guest_email``) and an empty relationship list. The ``classification`` column of
    ``hotels`` is switched to ``categorical`` through ``update_column`` before the
    comparison.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    stay_date = {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'}
    hotels_table = {
        'columns': {
            'hotel_id': {'sdtype': 'id'},
            'city': {'sdtype': 'city', 'pii': True},
            'state': {'sdtype': 'administrative_unit', 'pii': True},
            'rating': {'sdtype': 'numerical'},
            'classification': {'sdtype': 'categorical'},
        },
        'primary_key': 'hotel_id',
    }
    guests_table = {
        'columns': {
            'guest_email': {'sdtype': 'email', 'pii': True},
            'hotel_id': {'sdtype': 'categorical'},
            'has_rewards': {'sdtype': 'categorical'},
            'room_type': {'sdtype': 'categorical'},
            'amenities_fee': {'sdtype': 'numerical'},
            'checkin_date': dict(stay_date),
            'checkout_date': dict(stay_date),
            'room_rate': {'sdtype': 'numerical'},
            'billing_address': {'sdtype': 'unknown', 'pii': True},
            'credit_card_number': {'sdtype': 'credit_card_number', 'pii': True},
        },
        'primary_key': 'guest_email',
    }

    # Run
    detected = Metadata.detect_from_dataframes(tables, infer_keys='primary_only')
    detected.update_column(
        table_name='hotels',
        column_name='classification',
        sdtype='categorical',
    )

    # Assert
    expected = {
        'tables': {'hotels': hotels_table, 'guests': guests_table},
        'relationships': [],
        'METADATA_SPEC_VERSION': 'V1',
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframes_multi_table_with_infer_keys_none():
    """Detect the ``fake_hotels`` tables with ``infer_keys=None``.

    The expected metadata has column sdtypes but no ``primary_key`` entry on either
    table and an empty relationship list. The ``classification`` column of ``hotels``
    is switched to ``categorical`` through ``update_column`` before the comparison.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    stay_date = {'sdtype': 'datetime', 'datetime_format': '%d %b %Y'}
    hotels_columns = {
        'hotel_id': {'sdtype': 'id'},
        'city': {'sdtype': 'city', 'pii': True},
        'state': {'sdtype': 'administrative_unit', 'pii': True},
        'rating': {'sdtype': 'numerical'},
        'classification': {'sdtype': 'categorical'},
    }
    guests_columns = {
        'guest_email': {'sdtype': 'email', 'pii': True},
        'hotel_id': {'sdtype': 'categorical'},
        'has_rewards': {'sdtype': 'categorical'},
        'room_type': {'sdtype': 'categorical'},
        'amenities_fee': {'sdtype': 'numerical'},
        'checkin_date': dict(stay_date),
        'checkout_date': dict(stay_date),
        'room_rate': {'sdtype': 'numerical'},
        'billing_address': {'sdtype': 'unknown', 'pii': True},
        'credit_card_number': {'sdtype': 'credit_card_number', 'pii': True},
    }

    # Run
    detected = Metadata.detect_from_dataframes(tables, infer_keys=None)
    detected.update_column(
        table_name='hotels',
        column_name='classification',
        sdtype='categorical',
    )

    # Assert
    expected = {
        'tables': {
            'hotels': {'columns': hotels_columns},
            'guests': {'columns': guests_columns},
        },
        'relationships': [],
        'METADATA_SPEC_VERSION': 'V1',
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframes_single_table():
"""Test the ``detect_from_dataframes`` method works with a single table."""
# Setup
Expand Down Expand Up @@ -151,6 +294,93 @@ def test_detect_from_dataframes_single_table():
assert metadata.to_dict() == expected_metadata


def test_detect_from_dataframes_single_table_infer_sdtypes_false():
    """Detect a single table via ``detect_from_dataframes`` with ``infer_sdtypes=False``.

    Only the ``hotels`` table is passed, registered as ``table_1``. After ``validate``
    is called, all five columns are expected to be ``unknown`` with ``pii`` set, with
    no primary key and no relationships.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    hotels_only = {'table_1': tables['hotels']}
    detected = Metadata.detect_from_dataframes(hotels_only, infer_sdtypes=False)

    # Run
    detected.validate()

    # Assert
    column_names = ('hotel_id', 'city', 'state', 'rating', 'classification')
    expected_columns = {name: {'sdtype': 'unknown', 'pii': True} for name in column_names}
    expected = {
        'METADATA_SPEC_VERSION': 'V1',
        'tables': {'table_1': {'columns': expected_columns}},
        'relationships': [],
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframes_single_table_infer_keys_primary_only():
    """Detect a single table via ``detect_from_dataframes`` with ``infer_keys='primary_only'``.

    Only the ``hotels`` table is passed, registered as ``table_1``. After ``validate``
    is called, the expected metadata has ``hotel_id`` as the primary key and an empty
    relationship list.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    hotels_only = {'table_1': tables['hotels']}
    detected = Metadata.detect_from_dataframes(hotels_only, infer_keys='primary_only')

    # Run
    detected.validate()

    # Assert
    expected_table = {
        'columns': {
            'hotel_id': {'sdtype': 'id'},
            'city': {'sdtype': 'city', 'pii': True},
            'state': {'sdtype': 'administrative_unit', 'pii': True},
            'rating': {'sdtype': 'numerical'},
            'classification': {'sdtype': 'unknown', 'pii': True},
        },
        'primary_key': 'hotel_id',
    }
    expected = {
        'METADATA_SPEC_VERSION': 'V1',
        'tables': {'table_1': expected_table},
        'relationships': [],
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframes_single_table_infer_keys_none():
    """Detect a single table via ``detect_from_dataframes`` with ``infer_keys=None``.

    Only the ``hotels`` table is passed, registered as ``table_1``. After ``validate``
    is called, the expected metadata lists the column sdtypes without any
    ``primary_key`` entry and with an empty relationship list.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    hotels_only = {'table_1': tables['hotels']}
    detected = Metadata.detect_from_dataframes(hotels_only, infer_keys=None)

    # Run
    detected.validate()

    # Assert
    expected_columns = {
        'hotel_id': {'sdtype': 'id'},
        'city': {'sdtype': 'city', 'pii': True},
        'state': {'sdtype': 'administrative_unit', 'pii': True},
        'rating': {'sdtype': 'numerical'},
        'classification': {'sdtype': 'unknown', 'pii': True},
    }
    expected = {
        'METADATA_SPEC_VERSION': 'V1',
        'tables': {'table_1': {'columns': expected_columns}},
        'relationships': [],
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframe():
"""Test that a single table can be detected as a DataFrame."""
# Setup
Expand Down Expand Up @@ -181,6 +411,62 @@ def test_detect_from_dataframe():
assert metadata.to_dict() == expected_metadata


def test_detect_from_dataframe_infer_sdtypes_false():
    """Detect a DataFrame via ``detect_from_dataframe`` with ``infer_sdtypes=False``.

    The ``hotels`` table is passed on its own. After ``validate`` is called, the
    expected metadata stores it under ``DEFAULT_TABLE_NAME`` with all five columns
    ``unknown`` and ``pii`` set, with no primary key and no relationships.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    hotels = tables['hotels']
    detected = Metadata.detect_from_dataframe(hotels, infer_sdtypes=False)

    # Run
    detected.validate()

    # Assert
    column_names = ('hotel_id', 'city', 'state', 'rating', 'classification')
    expected_columns = {name: {'sdtype': 'unknown', 'pii': True} for name in column_names}
    expected = {
        'METADATA_SPEC_VERSION': 'V1',
        'tables': {DEFAULT_TABLE_NAME: {'columns': expected_columns}},
        'relationships': [],
    }
    assert detected.to_dict() == expected


def test_detect_from_dataframe_infer_keys_none():
    """Detect a DataFrame via ``detect_from_dataframe`` with ``infer_keys=None``.

    The ``hotels`` table is passed on its own. After ``validate`` is called, the
    expected metadata stores it under ``DEFAULT_TABLE_NAME`` with the column sdtypes
    but no ``primary_key`` entry, and with an empty relationship list.
    """
    # Setup
    tables, _ = download_demo(modality='multi_table', dataset_name='fake_hotels')
    hotels = tables['hotels']
    detected = Metadata.detect_from_dataframe(hotels, infer_keys=None)

    # Run
    detected.validate()

    # Assert
    expected_columns = {
        'hotel_id': {'sdtype': 'id'},
        'city': {'sdtype': 'city', 'pii': True},
        'state': {'sdtype': 'administrative_unit', 'pii': True},
        'rating': {'sdtype': 'numerical'},
        'classification': {'sdtype': 'unknown', 'pii': True},
    }
    expected = {
        'METADATA_SPEC_VERSION': 'V1',
        'tables': {DEFAULT_TABLE_NAME: {'columns': expected_columns}},
        'relationships': [],
    }
    assert detected.to_dict() == expected


def test_detect_from_csvs(tmp_path):
"""Test the ``detect_from_csvs`` method."""
# Setup
Expand Down
Loading

0 comments on commit 4b6dba4

Please sign in to comment.