Skip to content

Commit

Permalink
make release-tag: Merge branch 'main' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
amontanez24 committed Nov 13, 2024
2 parents c179792 + ae687dc commit e9ef7ba
Show file tree
Hide file tree
Showing 16 changed files with 104 additions and 28 deletions.
1 change: 1 addition & 0 deletions .github/workflows/dependency_checker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ jobs:
run: |
python -m pip install .[dev]
make check-deps OUTPUT_FILEPATH=latest_requirements.txt
make fix-lint
- name: Create pull request
id: cpr
uses: peter-evans/create-pull-request@v4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/static_code_analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
python -m pip install --upgrade pip
python -m pip install bandit==1.7.7
- name: Save code analysis
run: bandit -r . -x ./tests -f txt -o static_code_analysis.txt --exit-zero
run: bandit -r . -x ./tests,./scripts -f txt -o static_code_analysis.txt --exit-zero
- name: Create pull request
id: cpr
uses: peter-evans/create-pull-request@v4
Expand Down
7 changes: 7 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# History

## v1.13.1 - 2024-11-13

### Bugs Fixed

* [PyArrow] `NotImplementedError` when using `FloatFormatter` with numerical data types during fit. - Issue [#886](https://github.com/sdv-dev/RDT/issues/886) by @fealho
* Trying to `print()` a `GaussianNormalizer` instance throws an error - Issue [#883](https://github.com/sdv-dev/RDT/issues/883) by @fealho

## v1.13.0 - 2024-10-08

### New Features
Expand Down
2 changes: 1 addition & 1 deletion latest_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Faker==30.1.0
Faker==30.8.2
copulas==0.11.1
numpy==2.0.2
pandas==2.2.3
Expand Down
11 changes: 7 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@ dependencies = [
rdt = { main = 'rdt.cli.__main__:main' }

[project.optional-dependencies]
copulas = ['copulas>=0.11.0',]
copulas = ['copulas>=0.12.0',]
pyarrow = ['pyarrow>=17.0.0']
test = [
'rdt[pyarrow]',
'rdt[copulas]',

'pytest>=3.4.2',
Expand Down Expand Up @@ -137,7 +139,7 @@ collect_ignore = ['pyproject.toml']
exclude_lines = ['NotImplementedError()']

[tool.bumpversion]
current_version = "1.13.0"
current_version = "1.13.1.dev1"
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
serialize = [
'{major}.{minor}.{patch}.{release}{candidate}',
Expand Down Expand Up @@ -203,10 +205,11 @@ select = [
# print statements
"T201",
# pandas-vet
"PD"
"PD",
# numpy 2.0
"NPY201"
]
ignore = [
"E501",
# pydocstyle
"D107", # Missing docstring in __init__
"D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
Expand Down
2 changes: 1 addition & 1 deletion rdt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

__author__ = 'DataCebo, Inc.'
__email__ = '[email protected]'
__version__ = '1.13.0'
__version__ = '1.13.1.dev1'


import sys
Expand Down
2 changes: 1 addition & 1 deletion rdt/transformers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def __repr__(self):
instanced = {
key: getattr(self, key)
for key in keys
if key != 'model_missing_values' # Remove after deprecation
if key != 'model_missing_values' and hasattr(self, key) # Remove after deprecation
}

if defaults == instanced:
Expand Down
2 changes: 1 addition & 1 deletion rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _raise_out_of_bounds_error(self, value, name, bound_type, min_bound, max_bou

def _validate_values_within_bounds(self, data):
if not self.computer_representation.startswith('Float'):
fractions = data[~data.isna() & data % 1 != 0]
fractions = data[~data.isna() & (data != (data // 1))]
if not fractions.empty:
raise ValueError(
f"The column '{data.name}' contains float values {fractions.tolist()}. "
Expand Down
3 changes: 1 addition & 2 deletions rdt/transformers/pii/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ def __init__(
self.provider_name = provider_name if provider_name else 'BaseProvider'
if self.provider_name != 'BaseProvider' and function_name is None:
raise TransformerInputError(
'Please specify the function name to use from the '
f"'{self.provider_name}' provider."
f"Please specify the function name to use from the '{self.provider_name}' provider."
)

self.function_name = function_name if function_name else 'lexify'
Expand Down
4 changes: 3 additions & 1 deletion rdt/transformers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,16 @@ def learn_rounding_digits(data):
"""
# check if data has any decimals
name = data.name
if str(data.dtype).endswith('[pyarrow]'):
data = data.to_numpy()
roundable_data = data[~(np.isinf(data.astype(float)) | pd.isna(data))]

# Doesn't contain numbers
if len(roundable_data) == 0:
return None

# Doesn't contain decimal digits
if ((roundable_data % 1) == 0).all():
if (roundable_data == roundable_data.astype(int)).all():
return 0

# Try to round to fewer digits
Expand Down
4 changes: 2 additions & 2 deletions static_code_analysis.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Run started:2024-09-05 19:41:22.889700
Run started:2024-10-09 15:39:00.488390

Test results:
No issues identified.

Code scanned:
Total lines of code: 5543
Total lines of code: 5629
Total lines skipped (#nosec): 0
Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0

Expand Down
17 changes: 6 additions & 11 deletions tests/unit/test_hyper_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ def test_validate_config_not_unique_field(self):

# Run
error_msg = re.escape(
'Error: Invalid config. Please provide unique keys for the sdtypes ' 'and transformers.'
'Error: Invalid config. Please provide unique keys for the sdtypes and transformers.'
)
with pytest.raises(InvalidConfigError, match=error_msg):
HyperTransformer._validate_config(config)
Expand Down Expand Up @@ -858,8 +858,7 @@ def test_set_config_already_fitted(self, mock_warnings):

# Assert
expected_warnings_msg = (
'For this change to take effect, please refit your data using '
"'fit' or 'fit_transform'."
"For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
)
mock_warnings.warn.assert_called_once_with(expected_warnings_msg)

Expand Down Expand Up @@ -2511,8 +2510,7 @@ def test_update_transformers_fitted(self, mock_warnings):

# Assert
expected_message = (
"For this change to take effect, please refit your data using 'fit' "
"or 'fit_transform'."
"For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
)

mock_warnings.warn.assert_called_once_with(expected_message)
Expand Down Expand Up @@ -2921,8 +2919,7 @@ def test_update_sdtypes_fitted(self, mock_warnings, mock_logger):

# Assert
expected_message = (
"For this change to take effect, please refit your data using 'fit' "
"or 'fit_transform'."
"For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
)
user_message = (
'The transformers for these columns may change based on the new sdtype.\n'
Expand Down Expand Up @@ -3470,8 +3467,7 @@ def test_remove_transformers_fitted(self, mock_warnings):

# Assert
expected_warnings_msg = (
'For this change to take effect, please refit your data using '
"'fit' or 'fit_transform'."
"For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
)
mock_warnings.warn.assert_called_once_with(expected_warnings_msg)
assert ht.field_transformers == {
Expand Down Expand Up @@ -3558,8 +3554,7 @@ def test_remove_transformers_by_sdtype(self, mock_warnings):
'column3': None,
}
expected_warnings_msg = (
'For this change to take effect, please refit your data using '
"'fit' or 'fit_transform'."
"For this change to take effect, please refit your data using 'fit' or 'fit_transform'."
)
mock_warnings.warn.assert_called_once_with(expected_warnings_msg)

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/transformers/pii/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def test___init__no_function_name(self):
"""
# Run / Assert
expected_message = (
'Please specify the function name to use from the ' "'credit_card' provider."
"Please specify the function name to use from the 'credit_card' provider."
)
with pytest.raises(TransformerInputError, match=expected_message):
AnonymizedFaker(provider_name='credit_card', locales=['en_US', 'fr_FR'])
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/transformers/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_get_input_sdtype_raises_warning(self, mock_get_supported_sdtypes):

# Run
expected_message = (
'`get_input_sdtype` is deprecated. Please use ' '`get_supported_sdtypes` instead.'
'`get_input_sdtype` is deprecated. Please use `get_supported_sdtypes` instead.'
)
with pytest.warns(FutureWarning, match=expected_message):
input_sdtype = BaseTransformer.get_input_sdtype()
Expand Down
25 changes: 25 additions & 0 deletions tests/unit/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,19 @@ def test__validate_values_within_bounds(self):
# Run
transformer._validate_values_within_bounds(data)

def test__validate_values_within_bounds_pyarrow(self):
    """Test that bounds validation accepts a pyarrow-backed integer series.

    The test has no explicit assertion: it passes as long as
    ``_validate_values_within_bounds`` does not raise.
    """
    # Setup
    try:
        arrow_values = pd.Series(range(10), dtype='int64[pyarrow]')
    except TypeError:
        pytest.skip("Skipping as old numpy/pandas versions don't support arrow")
    formatter = FloatFormatter()
    formatter.computer_representation = 'UInt8'

    # Run
    formatter._validate_values_within_bounds(arrow_values)

def test__validate_values_within_bounds_under_minimum(self):
"""Test the ``_validate_values_within_bounds`` method.
Expand Down Expand Up @@ -1163,6 +1176,18 @@ def test__reverse_transform_missing_value_generation(self):
# Assert
np.testing.assert_allclose(transformed_data, expected, rtol=1e-3)

def test_print(self, capsys):
    """Test that printing a ``GaussianNormalizer`` writes its repr. GH#883"""
    # Setup
    normalizer = GaussianNormalizer()

    # Run
    print(normalizer)  # noqa: T201 `print` found

    # Assert
    printed = capsys.readouterr().out
    assert printed == 'GaussianNormalizer()\n'


class TestClusterBasedNormalizer(TestCase):
def test__get_current_random_seed_random_states_is_none(self):
Expand Down
46 changes: 45 additions & 1 deletion tests/unit/transformers/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sre_parse
import warnings
from sre_constants import MAXREPEAT
from unittest.mock import patch
from unittest.mock import Mock, patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -225,6 +225,36 @@ def test_learn_rounding_digits_less_than_15_decimals():
assert output == 3


def test_learn_rounding_digits_pyarrow():
    """Test that a pyarrow-backed integer series yields zero rounding digits."""
    # Setup
    try:
        arrow_ints = pd.Series(range(10), dtype='int64[pyarrow]')
    except TypeError:
        pytest.skip("Skipping as old numpy/pandas versions don't support arrow")

    # Run
    digits = learn_rounding_digits(arrow_ints)

    # Assert
    assert digits == 0


def test_learn_rounding_digits_pyarrow_float():
    """Test that the number of decimal digits is learned from pyarrow floats."""
    # Setup
    try:
        arrow_floats = pd.Series([0.5, 0.19, 3], dtype='float64[pyarrow]')
    except TypeError:
        pytest.skip("Skipping as old numpy/pandas versions don't support arrow")

    # Run
    digits = learn_rounding_digits(arrow_floats)

    # Assert
    assert digits == 2


def test_learn_rounding_digits_negative_decimals_float():
"""Test the learn_rounding_digits method with floats multiples of powers of 10.
Expand Down Expand Up @@ -299,6 +329,20 @@ def test_learn_rounding_digits_nullable_numerical_pandas_dtypes():
assert output == expected_output[column]


def test_learn_rounding_digits_pyarrow_to_numpy():
    """Test that ``learn_rounding_digits`` works with pyarrow to numpy conversion.

    A mock whose ``dtype`` string ends in ``[pyarrow]`` stands in for a
    pyarrow-backed series, so the test can check both that the data is
    converted through ``to_numpy`` and that the converted integers produce
    the expected result.
    """
    # Setup
    data = Mock()
    data.dtype = 'int64[pyarrow]'
    data.to_numpy.return_value = np.array([1, 2, 3])

    # Run
    output = learn_rounding_digits(data)

    # Assert
    # ``assert_called_once`` is stricter than the truthy ``.called`` flag:
    # it also fails if the conversion is performed more than once.
    data.to_numpy.assert_called_once()
    # The converted values are whole numbers, so no decimal digits are needed.
    assert output == 0


def test_warn_dict():
"""Test that ``WarnDict`` will raise a warning when called with `text`."""
# Setup
Expand Down

0 comments on commit e9ef7ba

Please sign in to comment.