make release-tag: Merge branch 'main' into stable

sdv-dev · Nov 13, 2024 · e9ef7ba
2 parents c179792 + ae687dc
commit e9ef7ba
Show file tree

Hide file tree

Showing 16 changed files with 104 additions and 28 deletions.
diff --git a/.github/workflows/dependency_checker.yml b/.github/workflows/dependency_checker.yml
@@ -16,6 +16,7 @@ jobs:
       run: |
         python -m pip install .[dev]
         make check-deps OUTPUT_FILEPATH=latest_requirements.txt
+        make fix-lint
     - name: Create pull request
       id: cpr
       uses: peter-evans/create-pull-request@v4

diff --git a/.github/workflows/static_code_analysis.yml b/.github/workflows/static_code_analysis.yml
@@ -19,7 +19,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install bandit==1.7.7
     - name: Save code analysis
-      run: bandit -r . -x ./tests -f txt -o static_code_analysis.txt --exit-zero
+      run: bandit -r . -x ./tests,./scripts -f txt -o static_code_analysis.txt --exit-zero
     - name: Create pull request
       id: cpr
       uses: peter-evans/create-pull-request@v4

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,12 @@
 # History
 
+## v1.13.1 - 2024-11-13
+
+### Bugs Fixed
+
+* [PyArrow] `NotImplementedError` when using `FloatFormatter` with numerical data types during fit. - Issue [#886](https://github.com/sdv-dev/RDT/issues/886) by @fealho
+* Trying to `print()` a GaussianNormalizer instance throws an error - Issue [#883](https://github.com/sdv-dev/RDT/issues/883) by @fealho
+
 ## v1.13.0 - 2024-10-08
 
 ### New Features

diff --git a/latest_requirements.txt b/latest_requirements.txt
@@ -1,4 +1,4 @@
-Faker==30.1.0
+Faker==30.8.2
 copulas==0.11.1
 numpy==2.0.2
 pandas==2.2.3

diff --git a/pyproject.toml b/pyproject.toml
@@ -48,8 +48,10 @@ dependencies = [
 rdt = { main = 'rdt.cli.__main__:main' }
 
 [project.optional-dependencies]
-copulas = ['copulas>=0.11.0',]
+copulas = ['copulas>=0.12.0',]
+pyarrow = ['pyarrow>=17.0.0']
 test = [
+    'rdt[pyarrow]',
     'rdt[copulas]',
 
     'pytest>=3.4.2',
@@ -137,7 +139,7 @@ collect_ignore = ['pyproject.toml']
 exclude_lines = ['NotImplementedError()']
 
 [tool.bumpversion]
-current_version = "1.13.0"
+current_version = "1.13.1.dev1"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',
@@ -203,10 +205,11 @@ select = [
     # print statements
     "T201",
     # pandas-vet
-    "PD"
+    "PD",
+    # numpy 2.0
+    "NPY201"
 ]
 ignore = [
-    "E501",
     # pydocstyle
     "D107",  # Missing docstring in __init__
     "D417",   # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449

diff --git a/rdt/__init__.py b/rdt/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = 'DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
-__version__ = '1.13.0'
+__version__ = '1.13.1.dev1'
 
 
 import sys

diff --git a/rdt/transformers/base.py b/rdt/transformers/base.py
@@ -343,7 +343,7 @@ def __repr__(self):
         instanced = {
             key: getattr(self, key)
             for key in keys
-            if key != 'model_missing_values'  # Remove after deprecation
+            if key != 'model_missing_values' and hasattr(self, key)  # Remove after deprecation
         }
 
         if defaults == instanced:

diff --git a/rdt/transformers/numerical.py b/rdt/transformers/numerical.py
@@ -104,7 +104,7 @@ def _raise_out_of_bounds_error(self, value, name, bound_type, min_bound, max_bou
 
     def _validate_values_within_bounds(self, data):
         if not self.computer_representation.startswith('Float'):
-            fractions = data[~data.isna() & data % 1 != 0]
+            fractions = data[~data.isna() & (data != (data // 1))]
             if not fractions.empty:
                 raise ValueError(
                     f"The column '{data.name}' contains float values {fractions.tolist()}. "

diff --git a/rdt/transformers/pii/anonymizer.py b/rdt/transformers/pii/anonymizer.py
@@ -133,8 +133,7 @@ def __init__(
         self.provider_name = provider_name if provider_name else 'BaseProvider'
         if self.provider_name != 'BaseProvider' and function_name is None:
             raise TransformerInputError(
-                'Please specify the function name to use from the '
-                f"'{self.provider_name}' provider."
+                f"Please specify the function name to use from the '{self.provider_name}' provider."
             )
 
         self.function_name = function_name if function_name else 'lexify'

diff --git a/rdt/transformers/utils.py b/rdt/transformers/utils.py
@@ -254,14 +254,16 @@ def learn_rounding_digits(data):
     """
     # check if data has any decimals
     name = data.name
+    if str(data.dtype).endswith('[pyarrow]'):
+        data = data.to_numpy()
     roundable_data = data[~(np.isinf(data.astype(float)) | pd.isna(data))]
 
     # Doesn't contain numbers
     if len(roundable_data) == 0:
         return None
 
     # Doesn't contain decimal digits
-    if ((roundable_data % 1) == 0).all():
+    if (roundable_data == roundable_data.astype(int)).all():
         return 0
 
     # Try to round to fewer digits

diff --git a/static_code_analysis.txt b/static_code_analysis.txt
@@ -1,10 +1,10 @@
-Run started:2024-09-05 19:41:22.889700
+Run started:2024-10-09 15:39:00.488390
 
 Test results:
 	No issues identified.
 
 Code scanned:
-	Total lines of code: 5543
+	Total lines of code: 5629
 	Total lines skipped (#nosec): 0
 	Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0
 

diff --git a/tests/unit/test_hyper_transformer.py b/tests/unit/test_hyper_transformer.py
@@ -553,7 +553,7 @@ def test_validate_config_not_unique_field(self):
 
         # Run
         error_msg = re.escape(
-            'Error: Invalid config. Please provide unique keys for the sdtypes ' 'and transformers.'
+            'Error: Invalid config. Please provide unique keys for the sdtypes and transformers.'
         )
         with pytest.raises(InvalidConfigError, match=error_msg):
             HyperTransformer._validate_config(config)
@@ -858,8 +858,7 @@ def test_set_config_already_fitted(self, mock_warnings):
 
         # Assert
         expected_warnings_msg = (
-            'For this change to take effect, please refit your data using '
-            "'fit' or 'fit_transform'."
+            "For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
         )
         mock_warnings.warn.assert_called_once_with(expected_warnings_msg)
 
@@ -2511,8 +2510,7 @@ def test_update_transformers_fitted(self, mock_warnings):
 
         # Assert
         expected_message = (
-            "For this change to take effect, please refit your data using 'fit' "
-            "or 'fit_transform'."
+            "For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
         )
 
         mock_warnings.warn.assert_called_once_with(expected_message)
@@ -2921,8 +2919,7 @@ def test_update_sdtypes_fitted(self, mock_warnings, mock_logger):
 
         # Assert
         expected_message = (
-            "For this change to take effect, please refit your data using 'fit' "
-            "or 'fit_transform'."
+            "For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
         )
         user_message = (
             'The transformers for these columns may change based on the new sdtype.\n'
@@ -3470,8 +3467,7 @@ def test_remove_transformers_fitted(self, mock_warnings):
 
         # Assert
         expected_warnings_msg = (
-            'For this change to take effect, please refit your data using '
-            "'fit' or 'fit_transform'."
+            "For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
         )
         mock_warnings.warn.assert_called_once_with(expected_warnings_msg)
         assert ht.field_transformers == {
@@ -3558,8 +3554,7 @@ def test_remove_transformers_by_sdtype(self, mock_warnings):
             'column3': None,
         }
         expected_warnings_msg = (
-            'For this change to take effect, please refit your data using '
-            "'fit' or 'fit_transform'."
+            "For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
         )
         mock_warnings.warn.assert_called_once_with(expected_warnings_msg)
 

diff --git a/tests/unit/transformers/pii/test_anonymizer.py b/tests/unit/transformers/pii/test_anonymizer.py
@@ -425,7 +425,7 @@ def test___init__no_function_name(self):
         """
         # Run / Assert
         expected_message = (
-            'Please specify the function name to use from the ' "'credit_card' provider."
+            "Please specify the function name to use from the 'credit_card' provider."
         )
         with pytest.raises(TransformerInputError, match=expected_message):
             AnonymizedFaker(provider_name='credit_card', locales=['en_US', 'fr_FR'])

diff --git a/tests/unit/transformers/test_base.py b/tests/unit/transformers/test_base.py
@@ -170,7 +170,7 @@ def test_get_input_sdtype_raises_warning(self, mock_get_supported_sdtypes):
 
         # Run
         expected_message = (
-            '`get_input_sdtype` is deprecated. Please use ' '`get_supported_sdtypes` instead.'
+            '`get_input_sdtype` is deprecated. Please use `get_supported_sdtypes` instead.'
         )
         with pytest.warns(FutureWarning, match=expected_message):
             input_sdtype = BaseTransformer.get_input_sdtype()

diff --git a/tests/unit/transformers/test_numerical.py b/tests/unit/transformers/test_numerical.py
@@ -44,6 +44,19 @@ def test__validate_values_within_bounds(self):
         # Run
         transformer._validate_values_within_bounds(data)
 
+    def test__validate_values_within_bounds_pyarrow(self):
+        """Test it works with pyarrow."""
+        # Setup
+        try:
+            data = pd.Series(range(10), dtype='int64[pyarrow]')
+        except TypeError:
+            pytest.skip("Skipping as old numpy/pandas versions don't support arrow")
+        transformer = FloatFormatter()
+        transformer.computer_representation = 'UInt8'
+
+        # Run
+        transformer._validate_values_within_bounds(data)
+
     def test__validate_values_within_bounds_under_minimum(self):
         """Test the ``_validate_values_within_bounds`` method.
 
@@ -1163,6 +1176,18 @@ def test__reverse_transform_missing_value_generation(self):
         # Assert
         np.testing.assert_allclose(transformed_data, expected, rtol=1e-3)
 
+    def test_print(self, capsys):
+        """Test the class can be printed. GH#883"""
+        # Setup
+        transformer = GaussianNormalizer()
+
+        # Run
+        print(transformer)  # noqa: T201 `print` found
+
+        # Assert
+        captured = capsys.readouterr()
+        assert captured.out == 'GaussianNormalizer()\n'
+
 
 class TestClusterBasedNormalizer(TestCase):
     def test__get_current_random_seed_random_states_is_none(self):

diff --git a/tests/unit/transformers/test_utils.py b/tests/unit/transformers/test_utils.py
@@ -1,7 +1,7 @@
 import sre_parse
 import warnings
 from sre_constants import MAXREPEAT
-from unittest.mock import patch
+from unittest.mock import Mock, patch
 
 import numpy as np
 import pandas as pd
@@ -225,6 +225,36 @@ def test_learn_rounding_digits_less_than_15_decimals():
     assert output == 3
 
 
+def test_learn_rounding_digits_pyarrow():
+    """Test it works with pyarrow."""
+    # Setup
+    try:
+        data = pd.Series(range(10), dtype='int64[pyarrow]')
+    except TypeError:
+        pytest.skip("Skipping as old numpy/pandas versions don't support arrow")
+
+    # Run
+    output = learn_rounding_digits(data)
+
+    # Assert
+    assert output == 0
+
+
+def test_learn_rounding_digits_pyarrow_float():
+    """Test it learns the proper amount of digits with pyarrow."""
+    # Setup
+    try:
+        data = pd.Series([0.5, 0.19, 3], dtype='float64[pyarrow]')
+    except TypeError:
+        pytest.skip("Skipping as old numpy/pandas versions don't support arrow")
+
+    # Run
+    output = learn_rounding_digits(data)
+
+    # Assert
+    assert output == 2
+
+
 def test_learn_rounding_digits_negative_decimals_float():
     """Test the learn_rounding_digits method with floats multiples of powers of 10.
 
@@ -299,6 +329,20 @@ def test_learn_rounding_digits_nullable_numerical_pandas_dtypes():
         assert output == expected_output[column]
 
 
+def test_learn_rounding_digits_pyarrow_to_numpy():
+    """Test that ``learn_rounding_digits`` works with pyarrow to numpy conversion."""
+    # Setup
+    data = Mock()
+    data.dtype = 'int64[pyarrow]'
+    data.to_numpy.return_value = np.array([1, 2, 3])
+
+    # Run
+    learn_rounding_digits(data)
+
+    # Assert
+    assert data.to_numpy.called
+
+
 def test_warn_dict():
     """Test that ``WarnDict`` will raise a warning when called with `text`."""
     # Setup