From f28c17ab93bf63b24960f05a285501b7bfee006f Mon Sep 17 00:00:00 2001 From: clee1152 Date: Wed, 2 Aug 2023 11:13:44 -0400 Subject: [PATCH 1/2] fix scipy mend issue (#988) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8532aaabf..7c8aa0b99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ fastavro>=1.0.0.post1 python-snappy>=0.5.4 charset-normalizer>=1.3.6 psutil>=4.0.0 -scipy>=1.4.1 +scipy>=1.10.0 requests>=2.28.1 networkx>=2.5.1 typing-extensions>=3.10.0.2 From da09c1de5db1d192a48a52e8a2ab67990c37fa5f Mon Sep 17 00:00:00 2001 From: clee1152 Date: Wed, 2 Aug 2023 12:00:06 -0400 Subject: [PATCH 2/2] HistogramAndQuantilesOption sync with dev branch (#987) * Changes to HistogramAndQuantilesOption now sync with concurrent updates to dev branch. * Changes to scipy version, fixing comments * Slight docstrings change * revert back -- other PR to fix * empty * fix --- dataprofiler/profilers/__init__.py | 5 +- dataprofiler/profilers/json_decoder.py | 9 + .../profilers/numerical_column_stats.py | 3 +- dataprofiler/profilers/profiler_options.py | 20 +- .../profiler_options/test_float_options.py | 2 +- .../test_histogram_and_quantiles_option.py | 311 ++++++++++++++++++ .../profiler_options/test_histogram_option.py | 190 ----------- .../profiler_options/test_int_options.py | 2 +- .../test_numerical_options.py | 2 +- .../profiler_options/test_text_options.py | 2 +- .../profilers/test_float_column_profile.py | 10 +- .../profilers/test_int_column_profile.py | 1 + .../test_numeric_stats_mixin_profile.py | 1 - 13 files changed, 353 insertions(+), 205 deletions(-) create mode 100644 dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py delete mode 100644 dataprofiler/tests/profilers/profiler_options/test_histogram_option.py diff --git a/dataprofiler/profilers/__init__.py b/dataprofiler/profilers/__init__.py index 64e33e384..4b068fcb0 100644 --- 
a/dataprofiler/profilers/__init__.py +++ b/dataprofiler/profilers/__init__.py @@ -28,7 +28,7 @@ DataLabelerOptions, DateTimeOptions, FloatOptions, - HistogramOption, + HistogramAndQuantilesOption, HyperLogLogOptions, IntOptions, ModeOption, @@ -66,7 +66,8 @@ json_decoder._options = { BooleanOption.__name__: BooleanOption, - HistogramOption.__name__: HistogramOption, + "HistogramOption": HistogramAndQuantilesOption, + HistogramAndQuantilesOption.__name__: HistogramAndQuantilesOption, ModeOption.__name__: ModeOption, BaseInspectorOptions.__name__: BaseInspectorOptions, NumericalOptions.__name__: NumericalOptions, diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py index 16bc2e148..fb4ff8cb9 100644 --- a/dataprofiler/profilers/json_decoder.py +++ b/dataprofiler/profilers/json_decoder.py @@ -1,6 +1,7 @@ """Contains methods to decode components of a Profiler.""" from __future__ import annotations +import warnings from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -72,6 +73,14 @@ def get_option_class(class_name: str) -> type[BaseOption]: options_class: type[BaseOption] | None = _options.get(class_name) if options_class is None: raise ValueError(f"Invalid option class {class_name} " f"failed to load.") + + if class_name == "HistogramOption": + warnings.warn( + f"{class_name} will be deprecated in the future. During the JSON encode" + " process, HistogramOption is mapped to HistogramAndQuantilesOption. 
" + "Please begin utilizing the new HistogramAndQuantilesOption class.", + DeprecationWarning, + ) return options_class diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index beb23c9e3..2b35c8792 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -82,7 +82,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self._mode_is_enabled: bool = True self.num_zeros: int | np.int64 = np.int64(0) self.num_negatives: int | np.int64 = np.int64(0) - self._num_quantiles: int = 1000 # TODO: add to options + self._num_quantiles: int = 1000 # By default, we use 1000 quantiles if options: self.bias_correction = options.bias_correction.is_enabled @@ -90,6 +90,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self._median_is_enabled = options.median.is_enabled self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled self._mode_is_enabled = options.mode.is_enabled + self._num_quantiles = options.histogram_and_quantiles.num_quantiles bin_count_or_method = options.histogram_and_quantiles.bin_count_or_method if isinstance(bin_count_or_method, str): self.histogram_bin_method_names = [bin_count_or_method] diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 0f2c6a7cc..e3d10696b 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -210,13 +210,14 @@ def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]: return errors -class HistogramOption(BooleanOption["HistogramOption"]): +class HistogramAndQuantilesOption(BooleanOption["HistogramAndQuantilesOption"]): """For setting histogram options.""" def __init__( self, is_enabled: bool = True, bin_count_or_method: str | int | list[str] = "auto", + num_quantiles: int = 1000, ) -> None: """ Initialize Options for histograms. 
@@ -226,11 +227,16 @@ def __init__( :ivar bin_count_or_method: bin count or the method with which to calculate histograms :vartype bin_count_or_method: Union[str, int, list(str)] + :ivar num_quantiles: number of quantiles + :vartype num_quantiles: int """ self.bin_count_or_method = bin_count_or_method + self.num_quantiles = num_quantiles super().__init__(is_enabled=is_enabled) - def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: + def _validate_helper( + self, variable_path: str = "HistogramAndQuantilesOption" + ) -> list[str]: """ Validate the options do not conflict and cause errors. @@ -260,6 +266,12 @@ def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: "than 1, a string, or list of strings from the " "following: {}.".format(variable_path, valid_methods) ) + + if self.num_quantiles is not None and ( + not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 + ): + errors.append(f"{variable_path}.num_quantiles must be a positive integer.") + return errors @@ -396,7 +408,9 @@ def __init__(self) -> None: self.median_abs_deviation: BooleanOption = BooleanOption(is_enabled=True) self.num_zeros: BooleanOption = BooleanOption(is_enabled=True) self.num_negatives: BooleanOption = BooleanOption(is_enabled=True) - self.histogram_and_quantiles: HistogramOption = HistogramOption() + self.histogram_and_quantiles: HistogramAndQuantilesOption = ( + HistogramAndQuantilesOption() + ) # By default, we correct for bias self.bias_correction: BooleanOption = BooleanOption(is_enabled=True) BaseInspectorOptions.__init__(self) diff --git a/dataprofiler/tests/profilers/profiler_options/test_float_options.py b/dataprofiler/tests/profilers/profiler_options/test_float_options.py index 044faa04e..9b67e3534 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_float_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_float_options.py @@ -94,7 +94,7 @@ def test_json_encode(self): "data": 
{"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py new file mode 100644 index 000000000..17abad647 --- /dev/null +++ b/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py @@ -0,0 +1,311 @@ +import json + +from dataprofiler.profilers.json_decoder import load_option +from dataprofiler.profilers.json_encoder import ProfileEncoder +from dataprofiler.profilers.profiler_options import HistogramAndQuantilesOption + +from .. import utils as test_utils +from .test_boolean_option import TestBooleanOption + + +class TestHistogramAndQuantilesOption(TestBooleanOption): + + option_class = HistogramAndQuantilesOption + keys = [] + + def test_init(self): + option = self.get_options() + self.assertTrue(option.is_enabled) + self.assertEqual(option.bin_count_or_method, "auto") + self.assertEqual(option.num_quantiles, 1000) + + def test_set_helper(self): + option = self.get_options() + + # validate, variable path being passed + expected_error = ( + "type object 'test.bin_count_or_method' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"bin_count_or_method.is_enabled": True}, "test") + + # validate, variable path being passed + expected_error = ( + "type object 'test.num_quantiles' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"num_quantiles.is_enabled": True}, "test") + + def test_set(self): + option = self.get_options() + + params_to_check = [ + dict(prop="is_enabled", value_list=[False, True]), + dict( + prop="bin_count_or_method", + value_list=[ + None, + "auto", + "fd", + "doane", + "scott", + "rice", + 
"sturges", + "sqrt", + ["sturges", "doane"], + 1, + 10, + 100, + 1000, + 99, + 10000000, + ], + ), + ] + + # this code can be abstracted to limit code everywhere else + # AKA, params_to_check would be the only needed code plus raise errors + def _assert_set_helper(prop, value): + option.set({prop: value}) + self.assertEqual(value, getattr(option, prop), msg=prop) + + for params in params_to_check: + prop, value_list = params["prop"], params["value_list"] + for value in value_list: + _assert_set_helper(prop, value) + + # Treat bin_count_or_method as a BooleanOption + expected_error = ( + "type object 'bin_count_or_method' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option.set({"bin_count_or_method.is_enabled": True}) + + # Treat num_quantiles as a BooleanOption + expected_error = "type object 'num_quantiles' has no attribute 'is_enabled'" + with self.assertRaisesRegex(AttributeError, expected_error): + option.set({"num_quantiles.is_enabled": True}) + + # Test set option for num_quantiles + option.set({"num_quantiles": 50}) + self.assertEqual(option.num_quantiles, 50) + + def test_validate_helper(self): + super().test_validate_helper() + + optpth = self.get_options_path() + + # Default configuration + option = self.get_options(num_quantiles=1000) + self.assertEqual([], option._validate_helper()) + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles + option = self.get_options(num_quantiles="Hello World") + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a float, must be an int + option = 
self.get_options(num_quantiles=1.1) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles may not be zero, must be at least one (1) + option = self.get_options(num_quantiles=0) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a negative integer + option = self.get_options(num_quantiles=-5) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + def test_validate(self): + + super().test_validate() + + optpth = self.get_options_path() + + params_to_check = [ + # non errors + dict(prop="is_enabled", value_list=[False, True], errors=[]), + dict( + prop="bin_count_or_method", + value_list=[ + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ["sturges", "doane"], + 1, + 10, + 100, + 1000, + 99, + 10000000, + ], + errors=[], + ), + # errors + dict( + prop="bin_count_or_method", + value_list=[ + -1, + 1.2, + 1.0, + [], + False, + "whoops", + ["doane", "incorrect"], + "1", + ], + errors=[ + "HistogramAndQuantilesOption.bin_count_or_method must be an integer " + "more than 1, a string, or list of strings from the " + "following: ['auto', 'fd', 'doane', 'scott', 'rice', " + "'sturges', 'sqrt']."
+ ], + ), + ] + + # this code can be abstracted to limit code everywhere else + # AKA, for loop below could be abstracted to a utils func + + # Default configuration is valid + option = self.get_options() + self.assertIsNone(option.validate(raise_error=False)) + + for params in params_to_check: + prop, value_list, expected_errors = ( + params["prop"], + params["value_list"], + params["errors"], + ) + option = self.get_options() + for value in value_list: + setattr(option, prop, value) + validate_errors = option.validate(raise_error=False) + if expected_errors: + self.assertListEqual( + expected_errors, + validate_errors, + msg=f"Errored for prop: {prop}, value: {value}.", + ) + else: + self.assertIsNone( + validate_errors, + msg=f"Errored for prop: {prop}, value: {value}.", + ) + + # this time testing raising an error + option.bin_count_or_method = "fake method" + expected_error = ( + r"HistogramAndQuantilesOption.bin_count_or_method must be an integer more than " + r"1, a string, or list of strings from the following: " + r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']." 
+ ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles cannot be a string, must be an int + option = self.get_options(num_quantiles="Hello World") + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=0) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a negative integer + option = self.get_options(num_quantiles=-5) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + def test_eq(self): + super().test_eq() + + options = self.get_options() + options2 = self.get_options() + options.bin_count_or_method = "sturges" + self.assertNotEqual(options, options2) + options2.bin_count_or_method = "doane" + self.assertNotEqual(options, options2) + options2.bin_count_or_method = "sturges" + self.assertEqual(options, options2) + options.num_quantiles = 30 + self.assertNotEqual(options, options2) + options2.num_quantiles = 50 + self.assertNotEqual(options, options2) + options2.num_quantiles = 30 + self.assertEqual(options, options2) + + 
def test_json_encode(self): + option = HistogramAndQuantilesOption( + is_enabled=False, bin_count_or_method="doane" + ) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "HistogramAndQuantilesOption", + "data": { + "bin_count_or_method": "doane", + "num_quantiles": 1000, + "is_enabled": False, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) + + def test_json_decode_warn(self): + old_histogram = { + "class": "HistogramOption", + "data": { + "bin_count_or_method": "doane", + "is_enabled": False, + }, + } + + expected = HistogramAndQuantilesOption( + is_enabled=False, bin_count_or_method="doane" + ) + + expected_string = json.dumps(old_histogram, cls=ProfileEncoder) + + expected_warning = ( + "HistogramOption will be deprecated in the future. During the JSON encode " + "process, HistogramOption is mapped to HistogramAndQuantilesOption. " + "Please begin utilizing the new HistogramAndQuantilesOption class." + ) + + with self.assertWarnsRegex(DeprecationWarning, expected_warning): + deserialized = load_option(json.loads(expected_string)) + test_utils.assert_profiles_equal(deserialized, expected) diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py deleted file mode 100644 index 4bf3a3b16..000000000 --- a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py +++ /dev/null @@ -1,190 +0,0 @@ -import json - -from dataprofiler.profilers.json_encoder import ProfileEncoder -from dataprofiler.profilers.profiler_options import HistogramOption - -from .test_boolean_option import TestBooleanOption - - -class TestHistogramOption(TestBooleanOption): - - option_class = HistogramOption - keys = [] - - def test_init(self): - option = self.get_options() - self.assertTrue(option.is_enabled) - self.assertEqual(option.bin_count_or_method, "auto") - - def test_set_helper(self): - option = self.get_options() - - # 
validate, variable path being passed - expected_error = ( - "type object 'test.bin_count_or_method' has no " "attribute 'is_enabled'" - ) - with self.assertRaisesRegex(AttributeError, expected_error): - option._set_helper({"bin_count_or_method.is_enabled": True}, "test") - - def test_set(self): - option = self.get_options() - - params_to_check = [ - dict(prop="is_enabled", value_list=[False, True]), - dict( - prop="bin_count_or_method", - value_list=[ - None, - "auto", - "fd", - "doane", - "scott", - "rice", - "sturges", - "sqrt", - ["sturges", "doane"], - 1, - 10, - 100, - 1000, - 99, - 10000000, - ], - ), - ] - - # this code can be abstracted to limit code everywhere else - # AKA, params_to_check would be the only needed code plus raise errors - def _assert_set_helper(prop, value): - option.set({prop: value}) - self.assertEqual(value, getattr(option, prop), msg=prop) - - for params in params_to_check: - prop, value_list = params["prop"], params["value_list"] - for value in value_list: - _assert_set_helper(prop, value) - - # Treat bin_count_or_method as a BooleanOption - expected_error = ( - "type object 'bin_count_or_method' has no attribute " "'is_enabled'" - ) - with self.assertRaisesRegex(AttributeError, expected_error): - option.set({"bin_count_or_method.is_enabled": True}) - - def test_validate_helper(self): - super().test_validate_helper() - - def test_validate(self): - - super().test_validate() - - params_to_check = [ - # non errors - dict(prop="is_enabled", value_list=[False, True], errors=[]), - dict( - prop="bin_count_or_method", - value_list=[ - "auto", - "fd", - "doane", - "scott", - "rice", - "sturges", - "sqrt", - ["sturges", "doane"], - 1, - 10, - 100, - 1000, - 99, - 10000000, - ], - errors=[], - ), - # errors - dict( - prop="bin_count_or_method", - value_list=[ - -1, - 1.2, - 1.0, - [], - False, - "whoops", - ["doane", "incorrect"], - "1", - ], - errors=[ - "HistogramOption.bin_count_or_method must be an integer " - "more than 1, a string, or 
list of strings from the " - "following: ['auto', 'fd', 'doane', 'scott', 'rice', " - "'sturges', 'sqrt']." - ], - ), - ] - - # # this code can be abstracted to limit code everywhere else - # # AKA, for loop below could be abstracted to a utils func - - # Default configuration is valid - option = self.get_options() - self.assertIsNone(option.validate(raise_error=False)) - - for params in params_to_check: - prop, value_list, expected_errors = ( - params["prop"], - params["value_list"], - params["errors"], - ) - option = self.get_options() - for value in value_list: - setattr(option, prop, value) - validate_errors = option.validate(raise_error=False) - if expected_errors: - self.assertListEqual( - expected_errors, - validate_errors, - msg=f"Errored for prop: {prop}, value: {value}.", - ) - else: - self.assertIsNone( - validate_errors, - msg=f"Errored for prop: {prop}, value: {value}.", - ) - - # this time testing raising an error - option.bin_count_or_method = "fake method" - expected_error = ( - r"HistogramOption.bin_count_or_method must be an integer more than " - r"1, a string, or list of strings from the following: " - r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']." 
- ) - with self.assertRaisesRegex(ValueError, expected_error): - option.validate() - - def test_eq(self): - super().test_eq() - - options = self.get_options() - options2 = self.get_options() - options.bin_count_or_method = "sturges" - self.assertNotEqual(options, options2) - options2.bin_count_or_method = "doane" - self.assertNotEqual(options, options2) - options2.bin_count_or_method = "sturges" - self.assertEqual(options, options2) - - def test_json_encode(self): - option = HistogramOption(is_enabled=False, bin_count_or_method="doane") - - serialized = json.dumps(option, cls=ProfileEncoder) - - expected = { - "class": "HistogramOption", - "data": { - "bin_count_or_method": "doane", - "is_enabled": False, - }, - } - - self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_int_options.py b/dataprofiler/tests/profilers/profiler_options/test_int_options.py index 317d1ff64..b767f3f3e 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_int_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_int_options.py @@ -86,7 +86,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py index 03d6c01db..ad0833d80 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py @@ -422,7 +422,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_text_options.py 
b/dataprofiler/tests/profilers/profiler_options/test_text_options.py index 57814126d..b26509e91 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_text_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_text_options.py @@ -128,7 +128,7 @@ def test_json_encode(self): "data": {"is_enabled": False}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 86e721a33..b7a2bfab7 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -37,6 +37,7 @@ def test_base_case(self): self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) self.assertIsNone(profiler.quantiles) + self.assertEqual(profiler._num_quantiles, 1000) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): @@ -1837,9 +1838,10 @@ def test_json_encode_after_update(self, time): data = np.array([0.0, 5.0, 10.0]) df = pd.Series(data).apply(str) - int_options = FloatOptions() - int_options.histogram_and_quantiles.bin_count_or_method = 5 - profiler = FloatColumn("0.0", int_options) + float_options = FloatOptions() + float_options.histogram_and_quantiles.bin_count_or_method = 5 + float_options.histogram_and_quantiles.num_quantiles = 4 + profiler = FloatColumn("0.0", float_options) mocked_quantiles = [0.25, 0.50, 0.75] with mock.patch.object( @@ -1884,7 +1886,7 @@ def test_json_encode_after_update(self, time): "_mode_is_enabled": True, "num_zeros": 1, "num_negatives": 0, - "_num_quantiles": 1000, + "_num_quantiles": 4, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 2.0, diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py 
b/dataprofiler/tests/profilers/test_int_column_profile.py index 01e624d20..d224a57a0 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -37,6 +37,7 @@ def test_base_case(self): self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) self.assertIsNone(profiler.quantiles) + self.assertEqual(profiler._num_quantiles, 1000) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index d01a7c382..4294dfd40 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -420,7 +420,6 @@ def test_from_dict_helper(self): ) expected_profile._stored_histogram = mock_saved_profile["_stored_histogram"] expected_profile.quantiles = None - expected_profile._stored_histogram["histogram"] = { "bin_counts": None, "bin_edges": None,