From 4ba24e8c1b477c701f20838229474a57d01a2416 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Wed, 14 Aug 2024 14:54:49 -0500 Subject: [PATCH 1/8] =?UTF-8?q?Bump=20version:=201.12.3=20=E2=86=92=201.12?= =?UTF-8?q?.4.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- rdt/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 728ea2e0..8b9a9cb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,7 +137,7 @@ collect_ignore = ['pyproject.toml'] exclude_lines = ['NotImplementedError()'] [tool.bumpversion] -current_version = "1.12.3" +current_version = "1.12.4.dev0" parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/rdt/__init__.py b/rdt/__init__.py index dbaaf673..a6b78122 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -4,7 +4,7 @@ __author__ = 'DataCebo, Inc.' __email__ = 'info@sdv.dev' -__version__ = '1.12.3' +__version__ = '1.12.4.dev0' import sys From 23df922e7538ff82fb253434bbb19ca501f3e28a Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:39:10 -0400 Subject: [PATCH 2/8] Latest Code Analysis (#868) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- static_code_analysis.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/static_code_analysis.txt b/static_code_analysis.txt index acc2df0d..af2bb1bb 100644 --- a/static_code_analysis.txt +++ b/static_code_analysis.txt @@ -1,10 +1,10 @@ -Run started:2024-07-09 19:56:52.363070 +Run started:2024-08-14 20:11:54.714181 Test results: No issues identified. 
Code scanned: - Total lines of code: 5539 + Total lines of code: 5530 Total lines skipped (#nosec): 0 Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 From 8cccc819ab554d9ec6870b96dba195159c2a16be Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:19:47 -0400 Subject: [PATCH 3/8] Automated Latest Dependency Updates (#870) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 0d039b1a..4d27b67b 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -1,4 +1,4 @@ -Faker==26.3.0 +Faker==27.0.0 copulas==0.11.0 numpy==2.0.1 pandas==2.2.2 From 2fab0919dd8864a9becad9c9fe8c2e5e2fff8b22 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Fri, 23 Aug 2024 08:23:38 +0200 Subject: [PATCH 4/8] Make create_anonymized_columns work with multi columns transformer (#872) --- rdt/hyper_transformer.py | 17 +++- tests/unit/test_hyper_transformer.py | 112 +++++++++++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) diff --git a/rdt/hyper_transformer.py b/rdt/hyper_transformer.py index 799a05a4..1ff2e8d9 100644 --- a/rdt/hyper_transformer.py +++ b/rdt/hyper_transformer.py @@ -871,8 +871,23 @@ def create_anonymized_columns(self, num_rows, column_names): 'list of valid column names.' ) + columns_to_generate = set() + for column in column_names: + if column not in self._multi_column_fields: + columns_to_generate.add(column) + continue + + multi_columns = self._multi_column_fields[column] + if any(col not in column_names for col in multi_columns): + raise InvalidConfigError( + f"Column '{column}' is part of a multi-column field. You must include all " + 'columns inside the multi-column field to generate the anonymized columns.' 
+ ) + + columns_to_generate.add(multi_columns) + transformers = [] - for column_name in column_names: + for column_name in sorted(columns_to_generate): transformer = self.field_transformers.get(column_name) if not transformer.is_generator(): raise TransformerProcessingError( diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py index 6260ecc9..dbdc2e80 100644 --- a/tests/unit/test_hyper_transformer.py +++ b/tests/unit/test_hyper_transformer.py @@ -1478,6 +1478,7 @@ def test_create_anonymized_columns(self): instance._modified_config = False instance._subset.return_value = False instance.random_state = {} + instance._multi_column_fields = {} random_element = AnonymizedFaker( function_name='random_element', function_kwargs={'elements': ['a']} @@ -1622,6 +1623,7 @@ def test_create_anonymized_columns_invalid_transformers(self): instance._fitted = True instance._modified_config = False instance._subset.return_value = False + instance._multi_column_fields = {} instance.field_transformers = { 'datetime': FloatFormatter(), @@ -1641,6 +1643,116 @@ def test_create_anonymized_columns_invalid_transformers(self): column_names=['datetime', 'random_element'], ) + def test_create_anonymized_columns_multi_column_transformer(self): + """Test ``create_anonymized_columns`` with a multi-column transformer.""" + + class GeneratorTransformer(BaseMultiColumnTransformer): + IS_GENERATOR = True + + def __init__(self): + super().__init__() + self.output_properties = {} + + def _fit(self, data): + self.columns = list(data.columns) + + def _transform(self, data): + return pd.DataFrame() + + def _get_prefix(self): + return + + def _reverse_transform(self, data): + num_rows = data.shape[0] + for column in self.columns: + data[column] = np.arange(num_rows) + + return data + + # Setup + instance = HyperTransformer() + instance._multi_column_fields = { + 'col1': ('col1', 'col2'), + 'col2': ('col1', 'col2'), + } + generator = GeneratorTransformer() + 
instance.field_transformers = { + ('col1', 'col2'): generator, + } + instance.field_sdtypes = { + 'col1': 'numerical', + 'col2': 'numerical', + } + instance.fit(pd.DataFrame({'col1': [1, 2, 3], 'col2': [1, 2, 3]})) + + # Run + output = instance.create_anonymized_columns(num_rows=5, column_names=['col1', 'col2']) + + # Assert + expected_output = pd.DataFrame({ + 'col1': [0, 1, 2, 3, 4], + 'col2': [0, 1, 2, 3, 4], + }) + pd.testing.assert_frame_equal(output, expected_output, check_dtype=False) + + def test_create_anonymized_columns_multi_column_transformer_error(self): + """Test ``create_anonymized_columns`` raises error with multi-column transformer. + + Test that: + - An error occurs when some columns in the column_name list are part of a multi-column + transformer, but not all the required columns of the multi-column + transformer are present. + - An error is raised when a multi-column transformer is not a generator. + """ + + class MultiColumnTransformer(BaseMultiColumnTransformer): + IS_GENERATOR = False + + def __init__(self): + super().__init__() + self.output_properties = {} + + def _fit(self, data): + self.columns = list(data.columns) + + def _transform(self, data): + return pd.DataFrame() + + def _get_prefix(self): + return + + # Setup + instance = HyperTransformer() + instance._multi_column_fields = { + 'col1': ('col1', 'col2'), + 'col2': ('col1', 'col2'), + } + not_generator = MultiColumnTransformer() + instance.field_transformers = { + ('col1', 'col2'): not_generator, + } + instance.field_sdtypes = { + 'col1': 'numerical', + 'col2': 'numerical', + } + instance.fit(pd.DataFrame({'col1': [1, 2, 3], 'col2': [1, 2, 3]})) + + # Run and Assert + error_msg_not_all_multi_column = re.escape( + "Column 'col1' is part of a multi-column field. You must include all " + 'columns inside the multi-column field to generate the anonymized columns.' 
+ ) + with pytest.raises(InvalidConfigError, match=error_msg_not_all_multi_column): + instance.create_anonymized_columns(num_rows=5, column_names=['col1']) + + error_msg_not_generator = re.escape( + "Column '('col1', 'col2')' cannot be anonymized. All columns must be assigned to " + "'AnonymizedFaker', 'RegexGenerator' or other ``generator``. Use " + "'get_config()' to see the current transformer assignments." + ) + with pytest.raises(TransformerProcessingError, match=error_msg_not_generator): + instance.create_anonymized_columns(num_rows=5, column_names=['col1', 'col2']) + def test_reverse_transform(self): """Test the ``reverse_transform`` method. From bb3262e2fa88264dbc8b4d9f497dddd87adee0f2 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:01:25 -0400 Subject: [PATCH 5/8] Automated Latest Dependency Updates (#873) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index 4d27b67b..be8e161e 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -1,5 +1,5 @@ -Faker==27.0.0 -copulas==0.11.0 +Faker==28.0.0 +copulas==0.11.1 numpy==2.0.1 pandas==2.2.2 scikit-learn==1.5.1 From 50cb70724dfd6e28c39ae473a5dd72cfd3d08786 Mon Sep 17 00:00:00 2001 From: R-Palazzo <116157184+R-Palazzo@users.noreply.github.com> Date: Wed, 28 Aug 2024 08:53:50 +0200 Subject: [PATCH 6/8] FloatFormatter does not round the data correctly for integer columns when using _set_fitted_parameters (#875) --- rdt/transformers/numerical.py | 3 ++- .../transformers/test_numerical.py | 20 +++++++++++++++++++ tests/unit/transformers/test_numerical.py | 1 + 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py index 64a5967f..1425670b 100644 --- a/rdt/transformers/numerical.py +++ 
b/rdt/transformers/numerical.py @@ -236,8 +236,9 @@ def _set_fitted_parameters( self._min_value = min(min_max_values) self._max_value = max(min_max_values) - if rounding_digits: + if rounding_digits is not None: self._rounding_digits = rounding_digits + self.learn_rounding_scheme = True if self.null_transformer.models_missing_values(): self.output_columns.append(column_name + '.is_null') diff --git a/tests/integration/transformers/test_numerical.py b/tests/integration/transformers/test_numerical.py index 056cb94a..93a78ab6 100644 --- a/tests/integration/transformers/test_numerical.py +++ b/tests/integration/transformers/test_numerical.py @@ -287,6 +287,26 @@ def test__support__nullable_numerical_pandas_dtypes(self): reverse_transformed[column].round(expected_rounding_digits[column]), ) + def test__set_fitted_parameter_rounding_to_integer(self): + """Test the ``_set_fitted_parameters`` method with rounding_digits set to 0.""" + # Setup + data = pd.DataFrame({ + 'col 1': 100 * np.random.random(10), + }) + transformer = FloatFormatter() + + # Run + transformer._set_fitted_parameters( + column_name='col 1', + null_transformer=NullTransformer(), + rounding_digits=0, + dtype='float', + ) + reverse_transformed_data = transformer.reverse_transform(data) + + # Assert + pd.testing.assert_frame_equal(reverse_transformed_data, data.round(0)) + class TestGaussianNormalizer: def test_stats(self): diff --git a/tests/unit/transformers/test_numerical.py b/tests/unit/transformers/test_numerical.py index 08a950ad..0af6b449 100644 --- a/tests/unit/transformers/test_numerical.py +++ b/tests/unit/transformers/test_numerical.py @@ -748,6 +748,7 @@ def test__set_fitted_parameters(self): assert transformer._max_value == 100.0 assert transformer._rounding_digits == rounding_digits assert transformer._dtype == dtype + assert transformer.learn_rounding_scheme is True def test__set_fitted_parameters_from_column(self): """Test ``_set_fitted_parameters`` sets the required parameters for 
transformer.""" From b16c978972a1cc4b36bc0b0740c961c711934d85 Mon Sep 17 00:00:00 2001 From: SDV Team <98988753+sdv-team@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:25:51 -0400 Subject: [PATCH 7/8] Automated Latest Dependency Updates (#876) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- latest_requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/latest_requirements.txt b/latest_requirements.txt index be8e161e..ca9c1343 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -1,6 +1,6 @@ -Faker==28.0.0 +Faker==28.1.0 copulas==0.11.1 -numpy==2.0.1 +numpy==2.0.2 pandas==2.2.2 scikit-learn==1.5.1 scipy==1.13.1 From e6e1cfd6c1352ebdc41e7b29d0bb066bb9e9cb04 Mon Sep 17 00:00:00 2001 From: Andrew Montanez Date: Thu, 5 Sep 2024 13:18:27 -0500 Subject: [PATCH 8/8] =?UTF-8?q?Bump=20version:=201.12.4.dev0=20=E2=86=92?= =?UTF-8?q?=201.12.4.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- rdt/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8b9a9cb4..1bf22f4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,7 +137,7 @@ collect_ignore = ['pyproject.toml'] exclude_lines = ['NotImplementedError()'] [tool.bumpversion] -current_version = "1.12.4.dev0" +current_version = "1.12.4.dev1" parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?' serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', diff --git a/rdt/__init__.py b/rdt/__init__.py index a6b78122..bbf0c0c8 100644 --- a/rdt/__init__.py +++ b/rdt/__init__.py @@ -4,7 +4,7 @@ __author__ = 'DataCebo, Inc.' __email__ = 'info@sdv.dev' -__version__ = '1.12.4.dev0' +__version__ = '1.12.4.dev1' import sys