diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 3e433d85d..58ea61179 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,7 +1,5 @@ """Contains functions for data readers.""" import json -import os -import random import re import urllib from collections import OrderedDict @@ -28,7 +26,7 @@ from chardet.universaldetector import UniversalDetector from typing_extensions import TypeGuard -from .. import dp_logging, settings +from .. import dp_logging, rng_utils from .._typing import JSONType, Url from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer # NOQA @@ -315,11 +313,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list: kinv = 1 / sample_nrows W = 1.0 - rng = random.Random(x=settings._seed) - if "DATAPROFILER_SEED" in os.environ and settings._seed is None: - seed = os.environ.get("DATAPROFILER_SEED") - if seed: - rng = random.Random(int(seed)) + rng = rng_utils.get_random_number_generator() while True: W *= rng.random() ** kinv @@ -334,7 +328,7 @@ def reservoir(file: TextIOWrapper, sample_nrows: int) -> list: except StopIteration: break # Append new, replace old with dummy, and keep track of order - remove_index = rng.randrange(sample_nrows) + remove_index = rng.integers(0, sample_nrows) values[indices[remove_index]] = str(None) indices[remove_index] = len(values) values.append(newval) @@ -824,7 +818,6 @@ def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO: "Content-length" in url.headers and int(url.headers["Content-length"]) >= 1024**3 ): - raise ValueError( "The downloaded file from the url may not be " "larger than 1GB" ) diff --git a/dataprofiler/profilers/__init__.py b/dataprofiler/profilers/__init__.py index 64e33e384..4b068fcb0 100644 --- a/dataprofiler/profilers/__init__.py +++ b/dataprofiler/profilers/__init__.py @@ -28,7 +28,7 @@ DataLabelerOptions, DateTimeOptions, FloatOptions, - HistogramOption, + HistogramAndQuantilesOption, HyperLogLogOptions, IntOptions, ModeOption, @@ -66,7 +66,8 @@ json_decoder._options = { BooleanOption.__name__: BooleanOption, - HistogramOption.__name__: HistogramOption, + "HistogramOption": HistogramAndQuantilesOption, + HistogramAndQuantilesOption.__name__: HistogramAndQuantilesOption, ModeOption.__name__: ModeOption, BaseInspectorOptions.__name__: BaseInspectorOptions, NumericalOptions.__name__: NumericalOptions, diff --git a/dataprofiler/profilers/base_column_profilers.py b/dataprofiler/profilers/base_column_profilers.py index d9c183c99..1ef5b75fe 100644 --- a/dataprofiler/profilers/base_column_profilers.py +++ b/dataprofiler/profilers/base_column_profilers.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from . import utils +from . 
import profiler_utils from .profiler_options import BaseInspectorOptions, BaseOption BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler") @@ -76,7 +76,7 @@ def _timeit(method: Callable = None, name: str = None) -> Callable: :param name: key argument for the times dictionary :type name: str """ - return utils.method_timeit(method, name) + return profiler_utils.method_timeit(method, name) @staticmethod def _filter_properties_w_options( @@ -173,7 +173,7 @@ def _add_helper( else: raise ValueError(f"Column names unmatched: {other1.name} != {other2.name}") - self.times = utils.add_nested_dictionaries(other1.times, other2.times) + self.times = profiler_utils.add_nested_dictionaries(other1.times, other2.times) self.sample_size = other1.sample_size + other2.sample_size diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index caaf3778e..1376cc38e 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -8,7 +8,7 @@ import datasketches from pandas import DataFrame, Series -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import CategoricalOptions @@ -131,7 +131,7 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn: elif not self.cms and not other.cms: # If both profiles have not met stop condition if not (self._stop_condition_is_met or other._stop_condition_is_met): - merged_profile._categories = utils.add_nested_dictionaries( + merged_profile._categories = profiler_utils.add_nested_dictionaries( self._categories, other._categories ) @@ -250,7 +250,7 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: # Make sure other_profile's type matches this class differences: dict = super().diff(other_profile, options) - differences["categorical"] = utils.find_diff_of_strings_and_bools( + differences["categorical"] = profiler_utils.find_diff_of_strings_and_bools( self.is_match, other_profile.is_match ) @@ -258,13 +258,13 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: [ ( "unique_count", - utils.find_diff_of_numbers( + profiler_utils.find_diff_of_numbers( self.unique_count, other_profile.unique_count ), ), ( "unique_ratio", - utils.find_diff_of_numbers( + profiler_utils.find_diff_of_numbers( self.unique_ratio, other_profile.unique_ratio ), ), @@ -275,19 +275,25 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: if self.is_match and other_profile.is_match: differences["statistics"][ "chi2-test" - ] = utils.perform_chi_squared_test_for_homogeneity( + ] = profiler_utils.perform_chi_squared_test_for_homogeneity( self._categories, self.sample_size, other_profile._categories, other_profile.sample_size, ) - differences["statistics"]["categories"] = utils.find_diff_of_lists_and_sets( + differences["statistics"][ + "categories" + ] = profiler_utils.find_diff_of_lists_and_sets( self.categories, other_profile.categories ) - differences["statistics"]["gini_impurity"] = utils.find_diff_of_numbers( + differences["statistics"][ + "gini_impurity" + ] = profiler_utils.find_diff_of_numbers( self.gini_impurity, other_profile.gini_impurity ) - differences["statistics"]["unalikeability"] = utils.find_diff_of_numbers( + differences["statistics"][ + "unalikeability" + ] = profiler_utils.find_diff_of_numbers( self.unalikeability, other_profile.unalikeability ) cat_count1 = dict( 
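# A minimal usage sketch (not part of the diff): how the renamed
# profiler_utils.add_nested_dictionaries helper behaves in the
# CategoricalColumn merge above -- counts for shared keys are summed and
# disjoint keys are carried over. The input values here are made up.
from dataprofiler.profilers import profiler_utils

counts_a = {"red": 3, "blue": 2}
counts_b = {"blue": 5, "green": 1}

merged = profiler_utils.add_nested_dictionaries(counts_a, counts_b)
assert merged == {"red": 3, "blue": 7, "green": 1}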
@@ -299,9 +305,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: ) ) - differences["statistics"]["categorical_count"] = utils.find_diff_of_dicts( - cat_count1, cat_count2 - ) + differences["statistics"][ + "categorical_count" + ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2) return differences @@ -532,7 +538,7 @@ def _merge_categories_cms( for k in (x for x in heavy_hitter_dict2 if x not in heavy_hitter_dict1): heavy_hitter_dict1[k] = cms1.get_estimate(k) - categories = utils.add_nested_dictionaries( + categories = profiler_utils.add_nested_dictionaries( heavy_hitter_dict2, heavy_hitter_dict1 ) @@ -604,7 +610,7 @@ def _update_categories( ) else: category_count = self._get_categories_full(df_series) - self._categories = utils.add_nested_dictionaries( + self._categories = profiler_utils.add_nested_dictionaries( self._categories, category_count ) self._update_stop_condition(df_series) diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py index e3a8ecb16..07edf13dc 100644 --- a/dataprofiler/profilers/column_profile_compilers.py +++ b/dataprofiler/profilers/column_profile_compilers.py @@ -8,7 +8,7 @@ from pandas import Series -from . import utils +from . import profiler_utils from .categorical_column_profile import CategoricalColumn from .data_labeler_column_profile import DataLabelerColumn from .datetime_column_profile import DateTimeColumn @@ -106,7 +106,7 @@ def _create_profile( df_series.name, options=profiler_options ) except Exception as e: - utils.warn_on_profile(profiler.type, e) + profiler_utils.warn_on_profile(profiler.type, e) # Update profile after creation self.update_profile(df_series, pool) @@ -338,7 +338,7 @@ def diff( if all_profiles: for key in all_profiles: if key in self._profiles and key in other._profiles: - diff = utils.find_diff_of_numbers( + diff = profiler_utils.find_diff_of_numbers( self._profiles[key].data_type_ratio, other._profiles[key].data_type_ratio, ) @@ -352,7 +352,7 @@ def diff( data_type1 = self.selected_data_type data_type2 = other.selected_data_type if data_type1 is not None or data_type2 is not None: - diff_profile["data_type"] = utils.find_diff_of_strings_and_bools( + diff_profile["data_type"] = profiler_utils.find_diff_of_strings_and_bools( data_type1, data_type2 ) # Find diff of matching profile statistics diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index 9487278d6..d9bfe1ee9 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -9,7 +9,7 @@ from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import DataLabelerOptions @@ -325,7 +325,7 @@ def load_from_dict(cls, data, config: dict | None = None) -> DataLabelerColumn: data_labeler_load_attr = data.pop("data_labeler") if data_labeler_load_attr: - data_labeler_object = utils.reload_labeler_from_options_or_get_new( + data_labeler_object = profiler_utils.reload_labeler_from_options_or_get_new( data_labeler_load_attr, config ) if data_labeler_object is not None: @@ -379,9 +379,13 @@ def diff(self, other_profile: DataLabelerColumn, options: dict = None) -> dict: other_label_rep = other_profile.label_representation differences = { - "data_label": utils.find_diff_of_lists_and_sets(self_labels, other_labels), - "avg_predictions": utils.find_diff_of_dicts(avg_preds, other_avg_preds), - "label_representation": utils.find_diff_of_dicts( + "data_label": profiler_utils.find_diff_of_lists_and_sets( + self_labels, other_labels + ), + "avg_predictions": profiler_utils.find_diff_of_dicts( + avg_preds, other_avg_preds + ), + "label_representation": profiler_utils.find_diff_of_dicts( label_rep, other_label_rep ), } diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py index fc7801dd3..af99283a9 100644 --- a/dataprofiler/profilers/datetime_column_profile.py +++ b/dataprofiler/profilers/datetime_column_profile.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .profiler_options import DateTimeOptions @@ -114,7 +114,7 @@ def __add__(self, other: DateTimeColumn) -> DateTimeColumn: merged_profile.max = other.max merged_profile._dt_obj_max = other._dt_obj_max - merged_profile.date_formats = utils._combine_unique_sets( + merged_profile.date_formats = profiler_utils._combine_unique_sets( self.date_formats, other.date_formats ) return merged_profile @@ -192,13 +192,13 @@ def diff(self, other_profile: DateTimeColumn, options: dict = None) -> dict: super().diff(other_profile, options) differences = { - "min": utils.find_diff_of_dates( + "min": profiler_utils.find_diff_of_dates( self._dt_obj_min, other_profile._dt_obj_min ), - "max": utils.find_diff_of_dates( + "max": profiler_utils.find_diff_of_dates( self._dt_obj_max, other_profile._dt_obj_max ), - "format": utils.find_diff_of_lists_and_sets( + "format": profiler_utils.find_diff_of_lists_and_sets( self.date_formats, other_profile.date_formats ), } diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index b816f221c..bc426a447 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .numerical_column_stats import NumericStatsMixin from .profiler_options import FloatOptions @@ -137,7 +137,7 @@ def diff(self, other_profile: FloatColumn, options: dict = None) -> dict: other_precision = other_profile.profile["precision"] precision_diff = dict() for key in self.profile["precision"].keys(): - precision_diff[key] = utils.find_diff_of_numbers( + precision_diff[key] = profiler_utils.find_diff_of_numbers( self.profile["precision"][key], other_precision[key] ) precision_diff.pop("confidence_level") diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index ecb5d63f6..0680a29a7 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -14,7 +14,7 @@ from packaging import version from ..data_readers.graph_data import GraphData -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import ProfilerOptions @@ -118,34 +118,34 @@ def diff(self, other_profile: GraphProfiler, options: dict = None) -> dict: ) diff_profile = { - "num_nodes": utils.find_diff_of_numbers( + "num_nodes": profiler_utils.find_diff_of_numbers( self._num_nodes, other_profile._num_nodes ), - "num_edges": utils.find_diff_of_numbers( + "num_edges": profiler_utils.find_diff_of_numbers( self._num_edges, other_profile._num_edges ), - "categorical_attributes": utils.find_diff_of_lists_and_sets( + "categorical_attributes": profiler_utils.find_diff_of_lists_and_sets( self._categorical_attributes, other_profile._categorical_attributes ), - "continuous_attributes": utils.find_diff_of_lists_and_sets( + "continuous_attributes": profiler_utils.find_diff_of_lists_and_sets( self._continuous_attributes, other_profile._continuous_attributes ), - "avg_node_degree": utils.find_diff_of_numbers( + "avg_node_degree": profiler_utils.find_diff_of_numbers( self._avg_node_degree, other_profile._avg_node_degree ), - "global_max_component_size": utils.find_diff_of_numbers( + "global_max_component_size": profiler_utils.find_diff_of_numbers( self._global_max_component_size, other_profile._global_max_component_size, ), - "continuous_distribution": utils.find_diff_of_dicts_with_diff_keys( + "continuous_distribution": profiler_utils.find_diff_of_dicts_with_diff_keys( self._continuous_distribution, other_profile._continuous_distribution, ), - "categorical_distribution": utils.find_diff_of_dicts_with_diff_keys( + "categorical_distribution": profiler_utils.find_diff_of_dicts_with_diff_keys( # noqa: E501 self._categorical_distribution, other_profile._categorical_distribution, ), - "times": utils.find_diff_of_dicts(self.times, other_profile.times), + "times": profiler_utils.find_diff_of_dicts(self.times, other_profile.times), } return diff_profile diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py index 16bc2e148..fb4ff8cb9 100644 --- a/dataprofiler/profilers/json_decoder.py +++ b/dataprofiler/profilers/json_decoder.py @@ -1,6 +1,7 @@ """Contains methods to decode components of a Profiler.""" from __future__ import annotations +import warnings from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -72,6 +73,14 @@ def get_option_class(class_name: str) -> type[BaseOption]: options_class: type[BaseOption] | None = _options.get(class_name) if options_class is None: raise ValueError(f"Invalid option class {class_name} " f"failed to load.") + + 
if class_name == "HistogramOption": + warnings.warn( + f"{class_name} will be deprecated in the future. During the JSON encode" + " process, HistogramOption is mapped to HistogramAndQuantilesOption. " + "Please begin utilizing the new HistogramAndQuantilesOption class.", + DeprecationWarning, + ) return options_class diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 707e916db..2b35c8792 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -13,7 +13,7 @@ import pandas as pd import scipy.stats -from . import histogram_utils, utils +from . import histogram_utils, profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import NumericalOptions @@ -82,7 +82,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self._mode_is_enabled: bool = True self.num_zeros: int | np.int64 = np.int64(0) self.num_negatives: int | np.int64 = np.int64(0) - self._num_quantiles: int = 1000 # TODO: add to options + self._num_quantiles: int = 1000 # By default, we use 1000 quantiles if options: self.bias_correction = options.bias_correction.is_enabled @@ -90,6 +90,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self._median_is_enabled = options.median.is_enabled self._median_abs_dev_is_enabled = options.median_abs_deviation.is_enabled self._mode_is_enabled = options.mode.is_enabled + self._num_quantiles = options.histogram_and_quantiles.num_quantiles bin_count_or_method = options.histogram_and_quantiles.bin_count_or_method if isinstance(bin_count_or_method, str): self.histogram_bin_method_names = [bin_count_or_method] @@ -497,20 +498,26 @@ def diff( ) differences = { - "min": utils.find_diff_of_numbers(self.min, other_profile.min), - "max": utils.find_diff_of_numbers(self.max, other_profile.max), - "sum": utils.find_diff_of_numbers(self.sum, other_profile.sum), - "mean": utils.find_diff_of_numbers(self.mean, other_profile.mean), - "median": utils.find_diff_of_numbers(self.median, other_profile.median), - "mode": utils.find_diff_of_lists_and_sets(self.mode, other_profile.mode), - "median_absolute_deviation": utils.find_diff_of_numbers( + "min": profiler_utils.find_diff_of_numbers(self.min, other_profile.min), + "max": profiler_utils.find_diff_of_numbers(self.max, other_profile.max), + "sum": profiler_utils.find_diff_of_numbers(self.sum, other_profile.sum), + "mean": profiler_utils.find_diff_of_numbers(self.mean, other_profile.mean), + "median": profiler_utils.find_diff_of_numbers( + self.median, other_profile.median + ), + "mode": profiler_utils.find_diff_of_lists_and_sets( + self.mode, other_profile.mode + ), + "median_absolute_deviation": profiler_utils.find_diff_of_numbers( self.median_abs_deviation, other_profile.median_abs_deviation, ), - "variance": utils.find_diff_of_numbers( + "variance": profiler_utils.find_diff_of_numbers( self.variance, other_profile.variance ), - "stddev": utils.find_diff_of_numbers(self.stddev, other_profile.stddev), + "stddev": profiler_utils.find_diff_of_numbers( + self.stddev, other_profile.stddev + ), "t-test": self._perform_t_test( self.mean, self.variance, @@ -1844,7 +1851,7 @@ def _get_skewness( ): return - batch_biased_skewness = utils.biased_skew(df_series) + batch_biased_skewness = profiler_utils.biased_skew(df_series) subset_properties["biased_skewness"] = batch_biased_skewness batch_count = subset_properties["match_count"] batch_biased_var = 
subset_properties["biased_variance"] @@ -1888,7 +1895,7 @@ def _get_kurtosis( ): return - batch_biased_kurtosis = utils.biased_kurt(df_series) + batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) subset_properties["biased_kurtosis"] = batch_biased_kurtosis batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index c6a369d8d..308262324 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -7,7 +7,7 @@ import numpy as np from pandas import DataFrame, Series -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import OrderOptions @@ -362,7 +362,7 @@ def diff(self, other_profile: OrderColumn, options: dict = None) -> dict: super().diff(other_profile, options) differences = { - "order": utils.find_diff_of_strings_and_bools( + "order": profiler_utils.find_diff_of_strings_and_bools( self.order, other_profile.order ) } diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index fc2a2246e..113d19ef2 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -5,7 +5,6 @@ import copy import json import logging -import os import pickle import random import re @@ -20,11 +19,11 @@ import pandas as pd from HLL import HyperLogLog -from .. import data_readers, dp_logging, settings +from .. import data_readers, dp_logging, rng_utils from ..data_readers.data import Data from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler -from . import utils +from . 
import profiler_utils from .column_profile_compilers import ( BaseCompiler, ColumnDataLabelerCompiler, @@ -271,7 +270,7 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di comp_diff = self.profiles[key].diff( other_profile.profiles[key], options=options ) - unordered_profile = utils.recursive_dict_update( + unordered_profile = profiler_utils.recursive_dict_update( unordered_profile, comp_diff ) @@ -287,16 +286,16 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di unordered_profile["statistics"].update( { - "sample_size": utils.find_diff_of_numbers( + "sample_size": profiler_utils.find_diff_of_numbers( self.sample_size, other_profile.sample_size ), - "null_count": utils.find_diff_of_numbers( + "null_count": profiler_utils.find_diff_of_numbers( self.null_count, other_profile.null_count ), - "null_types": utils.find_diff_of_lists_and_sets( + "null_types": profiler_utils.find_diff_of_lists_and_sets( self.null_types, other_profile.null_types ), - "null_types_index": utils.find_diff_of_dicts_with_diff_keys( + "null_types_index": profiler_utils.find_diff_of_dicts_with_diff_keys( self.null_types_index, other_profile.null_types_index ), } @@ -337,7 +336,7 @@ def report(self, remove_disabled_flag: bool = False) -> OrderedDict: """Return profile.""" unordered_profile: dict = dict() for profile in self.profiles.values(): - unordered_profile = utils.recursive_dict_update( + unordered_profile = profiler_utils.recursive_dict_update( unordered_profile, profile.report(remove_disabled_flag) ) @@ -429,7 +428,7 @@ def _update_base_stats(self, base_stats: dict) -> None: self._last_batch_size = base_stats["sample_size"] self.sample = base_stats["sample"] self.null_count += base_stats["null_count"] - self.null_types = utils._combine_unique_sets( + self.null_types = profiler_utils._combine_unique_sets( self.null_types, list(base_stats["null_types"].keys()) ) @@ -438,7 +437,7 @@ def _update_base_stats(self, base_stats: dict) -> None: base_nti = base_stats["null_types"] # Check if indices overlap, if they do, adjust attributes accordingly - if utils.overlap(self._min_id, self._max_id, base_min, base_max): + if profiler_utils.overlap(self._min_id, self._max_id, base_min, base_max): warnings.warn( f"Overlapping indices detected. 
To resolve, indices " f"where null data present will be shifted forward " @@ -602,11 +601,11 @@ def clean_data_and_get_base_stats( # Select generator depending if sample_ids availability if sample_ids is None: - sample_ind_generator = utils.shuffle_in_chunks( + sample_ind_generator = profiler_utils.shuffle_in_chunks( len_df, chunk_size=sample_size ) else: - sample_ind_generator = utils.partition( + sample_ind_generator = profiler_utils.partition( sample_ids[0], chunk_size=sample_size ) @@ -654,14 +653,7 @@ def clean_data_and_get_base_stats( df_series = df_series.loc[true_sample_list] total_na = total_sample_size - len(true_sample_list) - rng = np.random.default_rng(settings._seed) - - if "DATAPROFILER_SEED" in os.environ and settings._seed is None: - seed = os.environ.get("DATAPROFILER_SEED") - if isinstance(seed, int): - rng = np.random.default_rng(int(seed)) - else: - warnings.warn("Seed should be an integer", RuntimeWarning) + rng = rng_utils.get_random_number_generator() base_stats = { "sample_size": total_sample_size, @@ -755,7 +747,7 @@ def __init__( self.options.set({"data_labeler.data_labeler_object": data_labeler}) except Exception as e: - utils.warn_on_profile("data_labeler", e) + profiler_utils.warn_on_profile("data_labeler", e) self.options.set({"data_labeler.is_enabled": False}) def _add_error_checks(self, other: BaseProfiler) -> None: @@ -801,7 +793,9 @@ def __add__(self, other: BaseProfiler) -> BaseProfiler: merged_profile.total_samples = self.total_samples + other.total_samples - merged_profile.times = utils.add_nested_dictionaries(self.times, other.times) + merged_profile.times = profiler_utils.add_nested_dictionaries( + self.times, other.times + ) return merged_profile @@ -826,10 +820,10 @@ def diff(self, other_profile: BaseProfiler, options: dict = None) -> dict: ( "global_stats", { - "file_type": utils.find_diff_of_strings_and_bools( + "file_type": profiler_utils.find_diff_of_strings_and_bools( self.file_type, other_profile.file_type ), - "encoding": utils.find_diff_of_strings_and_bools( + "encoding": profiler_utils.find_diff_of_strings_and_bools( self.encoding, other_profile.encoding ), }, @@ -1080,7 +1074,7 @@ def _restore_data_labelers(self, data_labeler: BaseDataLabeler = None) -> None: self.options.set({"data_labeler.data_labeler_object": data_labeler}) except Exception as e: - utils.warn_on_profile("data_labeler", e) + profiler_utils.warn_on_profile("data_labeler", e) self.options.set({"data_labeler.is_enabled": False}) self.options.set({"data_labeler.data_labeler_object": data_labeler}) @@ -1334,13 +1328,13 @@ def diff( # type: ignore[override] report["global_stats"].update( { - "samples_used": utils.find_diff_of_numbers( + "samples_used": profiler_utils.find_diff_of_numbers( self.total_samples, other_profile.total_samples ), - "empty_line_count": utils.find_diff_of_numbers( + "empty_line_count": profiler_utils.find_diff_of_numbers( self._empty_line_count, other_profile._empty_line_count ), - "memory_size": utils.find_diff_of_numbers( + "memory_size": profiler_utils.find_diff_of_numbers( self.memory_size, other_profile.memory_size ), } @@ -1444,7 +1438,7 @@ def load_from_dict( """ raise NotImplementedError("UnstructuredProfiler deserialization not supported.") - @utils.method_timeit(name="clean_and_base_stats") + @profiler_utils.method_timeit(name="clean_and_base_stats") def _clean_data_and_get_base_stats( self, data: pd.Series, sample_size: int, min_true_samples: int = None ) -> tuple[pd.Series, dict]: @@ -1481,10 +1475,14 @@ def _clean_data_and_get_base_stats( 
data = data.apply(str) # get memory size - base_stats: dict = {"memory_size": utils.get_memory_size(data, unit="M")} + base_stats: dict = { + "memory_size": profiler_utils.get_memory_size(data, unit="M") + } # Setup sample generator - sample_ind_generator = utils.shuffle_in_chunks(len_data, chunk_size=sample_size) + sample_ind_generator = profiler_utils.shuffle_in_chunks( + len_data, chunk_size=sample_size + ) true_sample_list = set() total_sample_size = 0 @@ -1869,34 +1867,34 @@ def diff( # type: ignore[override] report = super().diff(other_profile, options) report["global_stats"].update( { - "samples_used": utils.find_diff_of_numbers( + "samples_used": profiler_utils.find_diff_of_numbers( self._max_col_samples_used, other_profile._max_col_samples_used ), - "column_count": utils.find_diff_of_numbers( + "column_count": profiler_utils.find_diff_of_numbers( len(self._profile), len(other_profile._profile) ), - "row_count": utils.find_diff_of_numbers( + "row_count": profiler_utils.find_diff_of_numbers( self.total_samples, other_profile.total_samples ), - "row_has_null_ratio": utils.find_diff_of_numbers( + "row_has_null_ratio": profiler_utils.find_diff_of_numbers( self._get_row_has_null_ratio(), other_profile._get_row_has_null_ratio(), ), - "row_is_null_ratio": utils.find_diff_of_numbers( + "row_is_null_ratio": profiler_utils.find_diff_of_numbers( self._get_row_is_null_ratio(), other_profile._get_row_is_null_ratio(), ), - "unique_row_ratio": utils.find_diff_of_numbers( + "unique_row_ratio": profiler_utils.find_diff_of_numbers( self._get_unique_row_ratio(), other_profile._get_unique_row_ratio() ), - "duplicate_row_count": utils.find_diff_of_numbers( + "duplicate_row_count": profiler_utils.find_diff_of_numbers( self._get_duplicate_row_count(), other_profile._get_duplicate_row_count(), ), - "correlation_matrix": utils.find_diff_of_matrices( + "correlation_matrix": profiler_utils.find_diff_of_matrices( self.correlation_matrix, other_profile.correlation_matrix ), - "chi2_matrix": utils.find_diff_of_matrices( + "chi2_matrix": profiler_utils.find_diff_of_matrices( self.chi2_matrix, other_profile.chi2_matrix ), "profile_schema": defaultdict(list), @@ -1916,7 +1914,7 @@ def diff( # type: ignore[override] report["global_stats"][ "profile_schema" - ] = utils.find_diff_of_dicts_with_diff_keys( + ] = profiler_utils.find_diff_of_dicts_with_diff_keys( self_profile_schema, other_profile_schema ) @@ -2193,7 +2191,7 @@ def _get_duplicate_row_count(self) -> int | None: ) return 0 - @utils.method_timeit(name="row_stats") + @profiler_utils.method_timeit(name="row_stats") def _update_row_statistics( self, data: pd.DataFrame, sample_ids: list[int] = None ) -> None: @@ -2347,7 +2345,7 @@ def _get_correlation( return corr_mat - @utils.method_timeit(name="correlation") + @profiler_utils.method_timeit(name="correlation") def _update_correlation( self, clean_samples: dict, prev_dependent_properties: dict ) -> None: @@ -2371,7 +2369,7 @@ def _update_correlation( batch_properties["count"], ) - @utils.method_timeit(name="correlation") + @profiler_utils.method_timeit(name="correlation") def _merge_correlation(self, other: StructuredProfiler) -> pd.DataFrame: """ Merge correlation matrix from two profiles. 
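# A minimal sketch (not part of the diff) of the seeding behavior that
# rng_utils.get_random_number_generator() now centralizes for the sampling
# code above (clean_data_and_get_base_stats, shuffle_in_chunks); the seed
# value here is arbitrary.
import os

from dataprofiler import rng_utils, settings

# settings._seed takes precedence; DATAPROFILER_SEED is consulted only when
# settings._seed is None, and a non-integer value warns (RuntimeWarning)
# instead of raising, falling back to an unseeded generator.
os.environ["DATAPROFILER_SEED"] = "0"
assert settings._seed is None
rng = rng_utils.get_random_number_generator()
print(rng.integers(0, 10))  # reproducible across runs with the same seed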
@@ -2570,7 +2568,7 @@ def _update_chi2(self) -> np.ndarray: if not profiler2.is_match: continue - results = utils.perform_chi_squared_test_for_homogeneity( + results = profiler_utils.perform_chi_squared_test_for_homogeneity( profiler1.categorical_counts, profiler1.sample_size, profiler2.categorical_counts, @@ -2834,7 +2832,7 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: yield e # Shuffle indices once and share with columns - sample_ids = [*utils.shuffle_in_chunks(len(data), len(data))] + sample_ids = [*profiler_utils.shuffle_in_chunks(len(data), len(data))] # If there are no minimum true samples, you can sort to save time if min_true_samples in [None, 0]: @@ -2869,12 +2867,17 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: ) ) + # If options.multiprocess is enabled, auto-toggle multiprocessing + auto_multiprocess_toggle = False + if self.options.multiprocess.is_enabled: + auto_multiprocess_toggle = profiler_utils.auto_multiprocess_toggle(data) + # Generate pool and estimate datasize pool = None - if self.options.multiprocess.is_enabled: + if auto_multiprocess_toggle: est_data_size = data[:50000].memory_usage(index=False, deep=True).sum() est_data_size = (est_data_size / min(50000, len(data))) * len(data) - pool, pool_size = utils.generate_pool( + pool, pool_size = profiler_utils.generate_pool( max_pool_size=None, data_size=est_data_size, cols=len(data.columns) ) @@ -2992,8 +2995,8 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: # Process and label the data notification_str = "Calculating the statistics... " pool = None - if self.options.multiprocess.is_enabled: - pool, pool_size = utils.generate_pool(4, est_data_size) + if auto_multiprocess_toggle: + pool, pool_size = profiler_utils.generate_pool(4, est_data_size) if pool: notification_str += " (with " + str(pool_size) + " processes)" diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index f34876a55..e3d10696b 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -9,7 +9,7 @@ from typing import Any, Generic, TypeVar, cast from ..labelers.base_data_labeler import BaseDataLabeler -from . import utils +from . import profiler_utils from .json_decoder import load_option BaseOptionT = TypeVar("BaseOptionT", bound="BaseOption") @@ -210,13 +210,14 @@ def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]: return errors -class HistogramOption(BooleanOption["HistogramOption"]): +class HistogramAndQuantilesOption(BooleanOption["HistogramAndQuantilesOption"]): """For setting histogram options.""" def __init__( self, is_enabled: bool = True, bin_count_or_method: str | int | list[str] = "auto", + num_quantiles: int = 1000, ) -> None: """ Initialize Options for histograms. @@ -226,11 +227,16 @@ def __init__( :ivar bin_count_or_method: bin count or the method with which to calculate histograms :vartype bin_count_or_method: Union[str, int, list(str)] + :ivar num_quantiles: number of quantiles + :vartype num_quantiles: int """ self.bin_count_or_method = bin_count_or_method + self.num_quantiles = num_quantiles super().__init__(is_enabled=is_enabled) - def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: + def _validate_helper( + self, variable_path: str = "HistogramAndQuantilesOption" + ) -> list[str]: """ Validate the options do not conflict and cause errors. 
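# A minimal sketch (not part of the diff) of configuring the renamed option
# and its new num_quantiles field; this assumes the dotted-path
# ProfilerOptions.set() API used elsewhere in the repo, with arbitrary values.
from dataprofiler import ProfilerOptions

profile_options = ProfilerOptions()
profile_options.set(
    {
        "histogram_and_quantiles.bin_count_or_method": "sturges",
        "histogram_and_quantiles.num_quantiles": 50,  # new in this change
    }
)
# num_quantiles must be a positive integer; other values fail validation
# with "<variable_path>.num_quantiles must be a positive integer."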
@@ -260,6 +266,12 @@ def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: "than 1, a string, or list of strings from the " "following: {}.".format(variable_path, valid_methods) ) + + if self.num_quantiles is not None and ( + not isinstance(self.num_quantiles, int) or self.num_quantiles < 1 + ): + errors.append(f"{variable_path}.num_quantiles must be a positive integer.") + return errors @@ -396,7 +408,9 @@ def __init__(self) -> None: self.median_abs_deviation: BooleanOption = BooleanOption(is_enabled=True) self.num_zeros: BooleanOption = BooleanOption(is_enabled=True) self.num_negatives: BooleanOption = BooleanOption(is_enabled=True) - self.histogram_and_quantiles: HistogramOption = HistogramOption() + self.histogram_and_quantiles: HistogramAndQuantilesOption = ( + HistogramAndQuantilesOption() + ) # By default, we correct for bias self.bias_correction: BooleanOption = BooleanOption(is_enabled=True) BaseInspectorOptions.__init__(self) @@ -1308,7 +1322,7 @@ def load_from_dict( data_labeler_object = None data_labeler_load_attr = data.pop("data_labeler_object", {}) if data_labeler_load_attr: - data_labeler_object = utils.reload_labeler_from_options_or_get_new( + data_labeler_object = profiler_utils.reload_labeler_from_options_or_get_new( data_labeler_load_attr, config ) if data_labeler_object: diff --git a/dataprofiler/profilers/utils.py b/dataprofiler/profilers/profiler_utils.py similarity index 96% rename from dataprofiler/profilers/utils.py rename to dataprofiler/profilers/profiler_utils.py index 09bfbac18..a3ed375b4 100644 --- a/dataprofiler/profilers/utils.py +++ b/dataprofiler/profilers/profiler_utils.py @@ -7,7 +7,6 @@ import functools import math import multiprocessing as mp -import os import time import warnings from abc import abstractmethod @@ -31,13 +30,14 @@ import scipy from pandas import DataFrame, Series -from .. import settings from ..labelers.data_labelers import DataLabeler if TYPE_CHECKING: from ..labelers.base_data_labeler import BaseDataLabeler from . import profile_builder +from .. import rng_utils + def recursive_dict_update(d: dict, update_d: dict) -> dict: """ @@ -109,14 +109,7 @@ def shuffle_in_chunks( if not data_length or data_length == 0 or not chunk_size or chunk_size == 0: return [] - rng = np.random.default_rng(settings._seed) - - if "DATAPROFILER_SEED" in os.environ and settings._seed is None: - seed = os.environ.get("DATAPROFILER_SEED") - if isinstance(seed, int): - rng = np.random.default_rng(int(seed)) - else: - warnings.warn("Seed should be an integer", RuntimeWarning) + rng = rng_utils.get_random_number_generator() indices = KeyDict() j = 0 @@ -184,6 +177,34 @@ def partition(data: list, chunk_size: int) -> Generator[list, None, Any]: yield data[idx : idx + chunk_size] +def auto_multiprocess_toggle( + data: DataFrame, + num_rows_threshold: int = 750000, + num_cols_threshold: int = 20, +) -> bool: + """ + Automate multiprocessing toggle depending on dataset sizes. 
+ + :param data: a dataset + :type data: pandas.DataFrame + :param num_rows_threshold: threshold for number of rows to + use multiprocess + :type num_rows_threshold: int + :param num_cols_threshold: threshold for number of columns + to use multiprocess + :type num_cols_threshold: int + :return: recommended option.multiprocess.is_enabled value + :rtype: bool + """ + # If the number of rows or columns exceed their respective threshold, + # we want to turn on multiprocessing + if data.shape[0] > num_rows_threshold or data.shape[1] > num_cols_threshold: + return True + # Otherwise, we do not turn on multiprocessing + else: + return False + + def suggest_pool_size(data_size: int = None, cols: int = None) -> int | None: """ Suggest the pool size based on resources. diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index e8446dcb8..bea8dbd68 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler from .numerical_column_stats import NumericStatsMixin from .profiler_options import TextOptions @@ -111,7 +111,9 @@ def diff(self, other_profile: TextColumn, options: dict = None) -> dict: differences = NumericStatsMixin.diff(self, other_profile, options) del differences["psi"] - vocab_diff = utils.find_diff_of_lists_and_sets(self.vocab, other_profile.vocab) + vocab_diff = profiler_utils.find_diff_of_lists_and_sets( + self.vocab, other_profile.vocab + ) differences["vocab"] = vocab_diff return differences @@ -149,7 +151,7 @@ def _update_vocab( :return: None """ data_flat = set(itertools.chain(*data)) - self.vocab = utils._combine_unique_sets(self.vocab, data_flat) + self.vocab = profiler_utils._combine_unique_sets(self.vocab, data_flat) def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: """ diff --git a/dataprofiler/profilers/unstructured_labeler_profile.py b/dataprofiler/profilers/unstructured_labeler_profile.py index d07f2647d..1c7b16c0f 100644 --- a/dataprofiler/profilers/unstructured_labeler_profile.py +++ b/dataprofiler/profilers/unstructured_labeler_profile.py @@ -8,7 +8,7 @@ from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler from ..labelers.data_processing import CharPostprocessor -from . import utils +from . 
import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import DataLabelerOptions @@ -87,14 +87,16 @@ def __add__(self, other: UnstructuredLabelerProfile) -> UnstructuredLabelerProfi options.data_labeler_object = self.data_labeler merged_profile = UnstructuredLabelerProfile(options=options) - merged_profile.entity_counts = utils.add_nested_dictionaries( + merged_profile.entity_counts = profiler_utils.add_nested_dictionaries( self.entity_counts, other.entity_counts ) merged_profile.char_sample_size = self.char_sample_size + other.char_sample_size merged_profile.word_sample_size = self.word_sample_size + other.word_sample_size - merged_profile.times = utils.add_nested_dictionaries(self.times, other.times) + merged_profile.times = profiler_utils.add_nested_dictionaries( + self.times, other.times + ) merged_profile._update_percentages() @@ -133,10 +135,10 @@ def diff( entity_counts_diff = {} entity_percentages_diff = {} for key in ["word_level", "true_char_level", "postprocess_char_level"]: - entity_percentages_diff[key] = utils.find_diff_of_dicts( + entity_percentages_diff[key] = profiler_utils.find_diff_of_dicts( self.entity_percentages[key], other_profile.entity_percentages[key] ) - entity_counts_diff[key] = utils.find_diff_of_dicts( + entity_counts_diff[key] = profiler_utils.find_diff_of_dicts( self.entity_counts[key], other_profile.entity_counts[key] ) diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py index ffc9eb503..96b7d0625 100644 --- a/dataprofiler/profilers/unstructured_text_profile.py +++ b/dataprofiler/profilers/unstructured_text_profile.py @@ -9,7 +9,7 @@ from numpy import ndarray from pandas import DataFrame, Series -from . import utils +from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import TextProfilerOptions @@ -610,18 +610,20 @@ def diff(self, other_profile: TextProfiler, options: dict = None) -> dict: other_word_count = {k.lower(): v for k, v in other_word_count.items()} diff: dict = {} - diff["vocab"] = utils.find_diff_of_lists_and_sets( + diff["vocab"] = profiler_utils.find_diff_of_lists_and_sets( list(self.vocab_count.keys()), list(other_profile.vocab_count.keys()) ) - diff["vocab_count"] = utils.find_diff_of_dicts_with_diff_keys( + diff["vocab_count"] = profiler_utils.find_diff_of_dicts_with_diff_keys( dict(self.vocab_count.most_common(self._top_k_chars)), dict(other_profile.vocab_count.most_common(self._top_k_chars)), ) - diff["words"] = utils.find_diff_of_lists_and_sets(self_words, other_words) + diff["words"] = profiler_utils.find_diff_of_lists_and_sets( + self_words, other_words + ) - diff["word_count"] = utils.find_diff_of_dicts_with_diff_keys( + diff["word_count"] = profiler_utils.find_diff_of_dicts_with_diff_keys( self_word_count, other_word_count ) diff --git a/dataprofiler/rng_utils.py b/dataprofiler/rng_utils.py new file mode 100644 index 000000000..329066658 --- /dev/null +++ b/dataprofiler/rng_utils.py @@ -0,0 +1,19 @@ +"""Create a random number generator using a manual seed DATAPROFILER_SEED.""" +import os +import warnings + +import numpy as np + +from . 
import settings + + +def get_random_number_generator() -> np.random._generator.Generator: + """Create a random number generator using a manual seed DATAPROFILER_SEED.""" + rng = np.random.default_rng(settings._seed) + if "DATAPROFILER_SEED" in os.environ and settings._seed is None: + seed: str = os.environ.get("DATAPROFILER_SEED", "") + try: + rng = np.random.default_rng(int(seed)) + except ValueError: + warnings.warn("Seed should be an integer", RuntimeWarning) + return rng diff --git a/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py b/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py index ef906e084..e2794c78a 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py @@ -196,7 +196,7 @@ def test_json_encode(self): self.assertDictEqual(expected, json.loads(serialized)) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode(self, mock_BaseDataLabeler): diff --git a/dataprofiler/tests/profilers/profiler_options/test_float_options.py b/dataprofiler/tests/profilers/profiler_options/test_float_options.py index 044faa04e..9b67e3534 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_float_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_float_options.py @@ -94,7 +94,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py new file mode 100644 index 000000000..17abad647 --- /dev/null +++ b/dataprofiler/tests/profilers/profiler_options/test_histogram_and_quantiles_option.py @@ -0,0 +1,311 @@ +import json + +from dataprofiler.profilers.json_decoder import load_option +from dataprofiler.profilers.json_encoder import ProfileEncoder +from dataprofiler.profilers.profiler_options import HistogramAndQuantilesOption + +from .. 
import utils as test_utils +from .test_boolean_option import TestBooleanOption + + +class TestHistogramAndQuantilesOption(TestBooleanOption): + + option_class = HistogramAndQuantilesOption + keys = [] + + def test_init(self): + option = self.get_options() + self.assertTrue(option.is_enabled) + self.assertEqual(option.bin_count_or_method, "auto") + self.assertEqual(option.num_quantiles, 1000) + + def test_set_helper(self): + option = self.get_options() + + # validate, variable path being passed + expected_error = ( + "type object 'test.bin_count_or_method' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"bin_count_or_method.is_enabled": True}, "test") + + # validate, variable path being passed + expected_error = ( + "type object 'test.num_quantiles' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option._set_helper({"num_quantiles.is_enabled": True}, "test") + + def test_set(self): + option = self.get_options() + + params_to_check = [ + dict(prop="is_enabled", value_list=[False, True]), + dict( + prop="bin_count_or_method", + value_list=[ + None, + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ["sturges", "doane"], + 1, + 10, + 100, + 1000, + 99, + 10000000, + ], + ), + ] + + # this code can be abstracted to limit code everywhere else + # AKA, params_to_check would be the only needed code plus raise errors + def _assert_set_helper(prop, value): + option.set({prop: value}) + self.assertEqual(value, getattr(option, prop), msg=prop) + + for params in params_to_check: + prop, value_list = params["prop"], params["value_list"] + for value in value_list: + _assert_set_helper(prop, value) + + # Treat bin_count_or_method as a BooleanOption + expected_error = ( + "type object 'bin_count_or_method' has no attribute 'is_enabled'" + ) + with self.assertRaisesRegex(AttributeError, expected_error): + option.set({"bin_count_or_method.is_enabled": True}) + + # Treat num_quantiles as a BooleanOption + expected_error = "type object 'num_quantiles' has no attribute 'is_enabled'" + with self.assertRaisesRegex(AttributeError, expected_error): + option.set({"num_quantiles.is_enabled": True}) + + # Test set option for num_quantiles + option.set({"num_quantiles": 50}) + self.assertEqual(option.num_quantiles, 50) + + def test_validate_helper(self): + super().test_validate_helper() + + optpth = self.get_options_path() + + # Default configuration + option = self.get_options(num_quantiles=1000) + self.assertEqual([], option._validate_helper()) + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles + option = self.get_options(num_quantiles="Hello World") + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles may not be zero, must be greater than one(1) + option = self.get_options(num_quantiles=0) + expected_error = 
[f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + # Option num_quantiles cannot be a negative integer + option = self.get_options(num_quantiles=-5) + expected_error = [f"{optpth}.num_quantiles must be a positive integer."] + self.assertSetEqual(set(expected_error), set(option._validate_helper())) + + def test_validate(self): + + super().test_validate() + + optpth = self.get_options_path() + + params_to_check = [ + # non errors + dict(prop="is_enabled", value_list=[False, True], errors=[]), + dict( + prop="bin_count_or_method", + value_list=[ + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ["sturges", "doane"], + 1, + 10, + 100, + 1000, + 99, + 10000000, + ], + errors=[], + ), + # errors + dict( + prop="bin_count_or_method", + value_list=[ + -1, + 1.2, + 1.0, + [], + False, + "whoops", + ["doane", "incorrect"], + "1", + ], + errors=[ + "HistogramAndQuantilesOption.bin_count_or_method must be an integer " + "more than 1, a string, or list of strings from the " + "following: ['auto', 'fd', 'doane', 'scott', 'rice', " + "'sturges', 'sqrt']." + ], + ), + ] + + # this code can be abstracted to limit code everywhere else + # AKA, for loop below could be abstracted to a utils func + + # Default configuration is valid + option = self.get_options() + self.assertIsNone(option.validate(raise_error=False)) + + for params in params_to_check: + prop, value_list, expected_errors = ( + params["prop"], + params["value_list"], + params["errors"], + ) + option = self.get_options() + for value in value_list: + setattr(option, prop, value) + validate_errors = option.validate(raise_error=False) + if expected_errors: + self.assertListEqual( + expected_errors, + validate_errors, + msg=f"Errored for prop: {prop}, value: {value}.", + ) + else: + self.assertIsNone( + validate_errors, + msg=f"Errored for prop: {prop}, value: {value}.", + ) + + # this time testing raising an error + option.bin_count_or_method = "fake method" + expected_error = ( + r"HistogramAndQuantilesOption.bin_count_or_method must be an integer more than " + r"1, a string, or list of strings from the following: " + r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']." 
+ ) + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Valid configurations + option = self.get_options(num_quantiles=50) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=2000) + self.assertEqual([], option._validate_helper()) + option = self.get_options(num_quantiles=1) + self.assertEqual([], option._validate_helper()) + + # Option num_quantiles cannot be a string, must be an int + option = self.get_options(num_quantiles="Hello World") + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a float, must be an int + option = self.get_options(num_quantiles=1.1) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles must be a positive integer + option = self.get_options(num_quantiles=0) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + # Option num_quantiles cannot be a negative integer + option = self.get_options(num_quantiles=-5) + expected_error = f"{optpth}.num_quantiles must be a positive integer" + with self.assertRaisesRegex(ValueError, expected_error): + option.validate() + + def test_eq(self): + super().test_eq() + + options = self.get_options() + options2 = self.get_options() + options.bin_count_or_method = "sturges" + self.assertNotEqual(options, options2) + options2.bin_count_or_method = "doane" + self.assertNotEqual(options, options2) + options2.bin_count_or_method = "sturges" + self.assertEqual(options, options2) + options.num_quantiles = 30 + self.assertNotEqual(options, options2) + options2.num_quantiles = 50 + self.assertNotEqual(options, options2) + options2.num_quantiles = 30 + self.assertEqual(options, options2) + + def test_json_encode(self): + option = HistogramAndQuantilesOption( + is_enabled=False, bin_count_or_method="doane" + ) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "HistogramAndQuantilesOption", + "data": { + "bin_count_or_method": "doane", + "num_quantiles": 1000, + "is_enabled": False, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) + + def test_json_decode_warn(self): + old_histogram = { + "class": "HistogramOption", + "data": { + "bin_count_or_method": "doane", + "is_enabled": False, + }, + } + + expected = HistogramAndQuantilesOption( + is_enabled=False, bin_count_or_method="doane" + ) + + expected_string = json.dumps(old_histogram, cls=ProfileEncoder) + + expected_warning = ( + "HistogramOption will be deprecated in the future. During the JSON encode " + "process, HistogramOption is mapped to HistogramAndQuantilesOption. " + "Please begin utilizing the new HistogramAndQuantilesOption class." 
+ ) + + with self.assertWarnsRegex(DeprecationWarning, expected_warning): + deserialized = load_option(json.loads(expected_string)) + test_utils.assert_profiles_equal(deserialized, expected) diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py deleted file mode 100644 index 4bf3a3b16..000000000 --- a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py +++ /dev/null @@ -1,190 +0,0 @@ -import json - -from dataprofiler.profilers.json_encoder import ProfileEncoder -from dataprofiler.profilers.profiler_options import HistogramOption - -from .test_boolean_option import TestBooleanOption - - -class TestHistogramOption(TestBooleanOption): - - option_class = HistogramOption - keys = [] - - def test_init(self): - option = self.get_options() - self.assertTrue(option.is_enabled) - self.assertEqual(option.bin_count_or_method, "auto") - - def test_set_helper(self): - option = self.get_options() - - # validate, variable path being passed - expected_error = ( - "type object 'test.bin_count_or_method' has no " "attribute 'is_enabled'" - ) - with self.assertRaisesRegex(AttributeError, expected_error): - option._set_helper({"bin_count_or_method.is_enabled": True}, "test") - - def test_set(self): - option = self.get_options() - - params_to_check = [ - dict(prop="is_enabled", value_list=[False, True]), - dict( - prop="bin_count_or_method", - value_list=[ - None, - "auto", - "fd", - "doane", - "scott", - "rice", - "sturges", - "sqrt", - ["sturges", "doane"], - 1, - 10, - 100, - 1000, - 99, - 10000000, - ], - ), - ] - - # this code can be abstracted to limit code everywhere else - # AKA, params_to_check would be the only needed code plus raise errors - def _assert_set_helper(prop, value): - option.set({prop: value}) - self.assertEqual(value, getattr(option, prop), msg=prop) - - for params in params_to_check: - prop, value_list = params["prop"], params["value_list"] - for value in value_list: - _assert_set_helper(prop, value) - - # Treat bin_count_or_method as a BooleanOption - expected_error = ( - "type object 'bin_count_or_method' has no attribute " "'is_enabled'" - ) - with self.assertRaisesRegex(AttributeError, expected_error): - option.set({"bin_count_or_method.is_enabled": True}) - - def test_validate_helper(self): - super().test_validate_helper() - - def test_validate(self): - - super().test_validate() - - params_to_check = [ - # non errors - dict(prop="is_enabled", value_list=[False, True], errors=[]), - dict( - prop="bin_count_or_method", - value_list=[ - "auto", - "fd", - "doane", - "scott", - "rice", - "sturges", - "sqrt", - ["sturges", "doane"], - 1, - 10, - 100, - 1000, - 99, - 10000000, - ], - errors=[], - ), - # errors - dict( - prop="bin_count_or_method", - value_list=[ - -1, - 1.2, - 1.0, - [], - False, - "whoops", - ["doane", "incorrect"], - "1", - ], - errors=[ - "HistogramOption.bin_count_or_method must be an integer " - "more than 1, a string, or list of strings from the " - "following: ['auto', 'fd', 'doane', 'scott', 'rice', " - "'sturges', 'sqrt']." 
- ], - ), - ] - - # # this code can be abstracted to limit code everywhere else - # # AKA, for loop below could be abstracted to a utils func - - # Default configuration is valid - option = self.get_options() - self.assertIsNone(option.validate(raise_error=False)) - - for params in params_to_check: - prop, value_list, expected_errors = ( - params["prop"], - params["value_list"], - params["errors"], - ) - option = self.get_options() - for value in value_list: - setattr(option, prop, value) - validate_errors = option.validate(raise_error=False) - if expected_errors: - self.assertListEqual( - expected_errors, - validate_errors, - msg=f"Errored for prop: {prop}, value: {value}.", - ) - else: - self.assertIsNone( - validate_errors, - msg=f"Errored for prop: {prop}, value: {value}.", - ) - - # this time testing raising an error - option.bin_count_or_method = "fake method" - expected_error = ( - r"HistogramOption.bin_count_or_method must be an integer more than " - r"1, a string, or list of strings from the following: " - r"\['auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt']." - ) - with self.assertRaisesRegex(ValueError, expected_error): - option.validate() - - def test_eq(self): - super().test_eq() - - options = self.get_options() - options2 = self.get_options() - options.bin_count_or_method = "sturges" - self.assertNotEqual(options, options2) - options2.bin_count_or_method = "doane" - self.assertNotEqual(options, options2) - options2.bin_count_or_method = "sturges" - self.assertEqual(options, options2) - - def test_json_encode(self): - option = HistogramOption(is_enabled=False, bin_count_or_method="doane") - - serialized = json.dumps(option, cls=ProfileEncoder) - - expected = { - "class": "HistogramOption", - "data": { - "bin_count_or_method": "doane", - "is_enabled": False, - }, - } - - self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_int_options.py b/dataprofiler/tests/profilers/profiler_options/test_int_options.py index 317d1ff64..b767f3f3e 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_int_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_int_options.py @@ -86,7 +86,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py index 03d6c01db..ad0833d80 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py @@ -422,7 +422,7 @@ def test_json_encode(self): "data": {"is_enabled": True}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { diff --git a/dataprofiler/tests/profilers/profiler_options/test_text_options.py b/dataprofiler/tests/profilers/profiler_options/test_text_options.py index 57814126d..b26509e91 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_text_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_text_options.py @@ -128,7 +128,7 @@ def test_json_encode(self): "data": {"is_enabled": False}, }, "histogram_and_quantiles": { - "class": "HistogramOption", + "class": "HistogramAndQuantilesOption", "data": mock.ANY, }, "bias_correction": { 
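# A minimal sketch (not part of the diff) of the backward-compatibility path
# the tests above exercise: a payload saved under the old "HistogramOption"
# name still loads as HistogramAndQuantilesOption, now with a
# DeprecationWarning raised by json_decoder.get_option_class.
import warnings

from dataprofiler.profilers.json_decoder import load_option

old_payload = {
    "class": "HistogramOption",
    "data": {"bin_count_or_method": "doane", "is_enabled": False},
}

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    option = load_option(old_payload)

assert type(option).__name__ == "HistogramAndQuantilesOption"
assert option.num_quantiles == 1000  # default backfilled for old payloads
assert any(issubclass(w.category, DeprecationWarning) for w in caught)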
diff --git a/dataprofiler/tests/profilers/test_base_column_profilers.py b/dataprofiler/tests/profilers/test_base_column_profilers.py index eb2ed764d..4ab7182cf 100644 --- a/dataprofiler/tests/profilers/test_base_column_profilers.py +++ b/dataprofiler/tests/profilers/test_base_column_profilers.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils from dataprofiler.profilers.base_column_profilers import ( BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler, @@ -185,7 +185,7 @@ def test_cannot_instantiate(self): def test_combine_unqiue_sets(self): a = [1, 2, 3] b = [3, 1, 4, -1] - c = utils._combine_unique_sets(a, b) + c = profiler_utils._combine_unique_sets(a, b) self.assertCountEqual([1, 2, 3, 4, -1], c) def test__init__(self): diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 957bb694f..35617d1e8 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -611,7 +611,7 @@ def test_json_decode_after_update(self): @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) @mock.patch( diff --git a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py index 5b25f939c..35f448aea 100644 --- a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py +++ b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py @@ -7,7 +7,7 @@ import pandas as pd from dataprofiler.labelers import BaseDataLabeler -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils from dataprofiler.profilers.data_labeler_column_profile import DataLabelerColumn from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder @@ -376,7 +376,7 @@ def test_diff(self, mock_instance): diff = profiler1.diff(profiler2) expected_diff = { - "data_label": utils.find_diff_of_lists_and_sets( + "data_label": profiler_utils.find_diff_of_lists_and_sets( ["a", "b", "c"], ["b", "c", "d"] ), "avg_predictions": {"a": "unchanged", "b": -0.70, "c": 0.70}, @@ -485,7 +485,9 @@ def test_json_encode_after_update(self, mock_instance): self.assertEqual(expected, serialized) - @mock.patch("dataprofiler.profilers.utils.DataLabeler", spec=BaseDataLabeler) + @mock.patch( + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler + ) def test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler): self._setup_data_labeler_mock(mock_BaseDataLabeler) mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler @@ -526,7 +528,9 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler): class_as_dict["data"]["data_labeler"] = {"from_disk": "test"} deserialized = load_column_profile(class_as_dict, config) - @mock.patch("dataprofiler.profilers.utils.DataLabeler", spec=BaseDataLabeler) + @mock.patch( + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler + ) def test_json_decode_after_update( self, mock_utils_DataLabeler, mock_BaseDataLabeler ): diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 86e721a33..b7a2bfab7 100644 --- 
a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -37,6 +37,7 @@ def test_base_case(self): self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) self.assertIsNone(profiler.quantiles) + self.assertEqual(profiler._num_quantiles, 1000) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): @@ -1837,9 +1838,10 @@ def test_json_encode_after_update(self, time): data = np.array([0.0, 5.0, 10.0]) df = pd.Series(data).apply(str) - int_options = FloatOptions() - int_options.histogram_and_quantiles.bin_count_or_method = 5 - profiler = FloatColumn("0.0", int_options) + float_options = FloatOptions() + float_options.histogram_and_quantiles.bin_count_or_method = 5 + float_options.histogram_and_quantiles.num_quantiles = 4 + profiler = FloatColumn("0.0", float_options) mocked_quantiles = [0.25, 0.50, 0.75] with mock.patch.object( @@ -1884,7 +1886,7 @@ def test_json_encode_after_update(self, time): "_mode_is_enabled": True, "num_zeros": 1, "num_negatives": 0, - "_num_quantiles": 1000, + "_num_quantiles": 4, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 2.0, diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 01e624d20..d224a57a0 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -37,6 +37,7 @@ def test_base_case(self): self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) self.assertIsNone(profiler.quantiles) + self.assertEqual(profiler._num_quantiles, 1000) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index d01a7c382..4294dfd40 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -420,7 +420,6 @@ def test_from_dict_helper(self): ) expected_profile._stored_histogram = mock_saved_profile["_stored_histogram"] expected_profile.quantiles = None - expected_profile._stored_histogram["histogram"] = { "bin_counts": None, "bin_edges": None, diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index a4326492f..9507fe20f 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -1587,7 +1587,7 @@ def test_save_and_load_json_file(self): save_profile.save(save_method="json") mock_file.seek(0) with mock.patch( - "dataprofiler.profilers.utils.DataLabeler.load_from_library", + "dataprofiler.profilers.profiler_utils.DataLabeler.load_from_library", return_value=data_labeler, ): load_profile = dp.StructuredProfiler.load("mock.json", "JSON") @@ -2509,7 +2509,7 @@ def test_json_encode_after_update(self, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): @@ -2543,7 +2543,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - 
"dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode_after_update( @@ -3213,7 +3213,7 @@ def test_json_encode_after_update(self, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): @@ -3234,7 +3234,7 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): spec=BaseDataLabeler, ) @mock.patch( - "dataprofiler.profilers.utils.DataLabeler", + "dataprofiler.profilers.profiler_utils.DataLabeler", spec=BaseDataLabeler, ) def test_json_decode_after_update( diff --git a/dataprofiler/tests/profilers/test_utils.py b/dataprofiler/tests/profilers/test_profiler_utils.py similarity index 67% rename from dataprofiler/tests/profilers/test_utils.py rename to dataprofiler/tests/profilers/test_profiler_utils.py index 10350b2b2..4eee1963a 100644 --- a/dataprofiler/tests/profilers/test_utils.py +++ b/dataprofiler/tests/profilers/test_profiler_utils.py @@ -7,26 +7,26 @@ import dataprofiler as dp from dataprofiler.labelers.base_data_labeler import BaseDataLabeler -from dataprofiler.profilers import utils +from dataprofiler.profilers import profiler_utils class TestShuffleInChunks(unittest.TestCase): """ - Validates utils.shuffle_in_chunks is properly working. + Validates profiler_utils.shuffle_in_chunks is properly working. """ def test_full_sample(self): """ Check if can shuffle full sample. """ - sample = next(utils.shuffle_in_chunks(data_length=10, chunk_size=10)) + sample = next(profiler_utils.shuffle_in_chunks(data_length=10, chunk_size=10)) self.assertCountEqual(sample, list(range(10))) def test_even_chunk_sample(self): """ Check if can shuffle sample where chunk size is evenly divisible. """ - sample_gen = utils.shuffle_in_chunks(data_length=12, chunk_size=3) + sample_gen = profiler_utils.shuffle_in_chunks(data_length=12, chunk_size=3) all_values = set() num_chunks = 0 @@ -41,7 +41,7 @@ def test_uneven_chunk_sample(self): """ Check if can shuffle sample where chunk size is not evenly divisible. 
""" - sample_gen = utils.shuffle_in_chunks(data_length=100, chunk_size=7) + sample_gen = profiler_utils.shuffle_in_chunks(data_length=100, chunk_size=7) all_values = set() num_chunks = 0 @@ -60,50 +60,65 @@ def test_find_diff(self): # Ensure lists and sets are handled appropriately self.assertEqual( - [[], [3, 2], [2]], utils.find_diff_of_lists_and_sets([3, 2], [2, 3, 2]) + [[], [3, 2], [2]], + profiler_utils.find_diff_of_lists_and_sets([3, 2], [2, 3, 2]), ) self.assertEqual( - [[1], [2, 3], [4]], utils.find_diff_of_lists_and_sets([1, 2, 3], [2, 3, 4]) + [[1], [2, 3], [4]], + profiler_utils.find_diff_of_lists_and_sets([1, 2, 3], [2, 3, 4]), ) - self.assertEqual("unchanged", utils.find_diff_of_lists_and_sets({3, 2}, {2, 3})) self.assertEqual( - [[1], [2, 3], [4]], utils.find_diff_of_lists_and_sets({1, 2, 3}, {2, 3, 4}) + "unchanged", profiler_utils.find_diff_of_lists_and_sets({3, 2}, {2, 3}) ) - self.assertEqual("unchanged", utils.find_diff_of_lists_and_sets({2, 3}, [2, 3])) self.assertEqual( - [[1], [2, 3], [4]], utils.find_diff_of_lists_and_sets([1, 2, 3], {2, 3, 4}) + [[1], [2, 3], [4]], + profiler_utils.find_diff_of_lists_and_sets({1, 2, 3}, {2, 3, 4}), ) self.assertEqual( - [None, {1, 2}], utils.find_diff_of_lists_and_sets(None, {1, 2}) + "unchanged", profiler_utils.find_diff_of_lists_and_sets({2, 3}, [2, 3]) + ) + self.assertEqual( + [[1], [2, 3], [4]], + profiler_utils.find_diff_of_lists_and_sets([1, 2, 3], {2, 3, 4}), + ) + self.assertEqual( + [None, {1, 2}], profiler_utils.find_diff_of_lists_and_sets(None, {1, 2}) + ) + self.assertEqual( + "unchanged", profiler_utils.find_diff_of_lists_and_sets(None, None) ) - self.assertEqual("unchanged", utils.find_diff_of_lists_and_sets(None, None)) # Ensure ints and floats are handled appropriately - self.assertEqual(1, utils.find_diff_of_numbers(5, 4)) - self.assertEqual(1.0, utils.find_diff_of_numbers(5.0, 4.0)) - self.assertEqual(1.0, utils.find_diff_of_numbers(5.0, 4)) - self.assertEqual("unchanged", utils.find_diff_of_numbers(5.0, 5.0)) - self.assertEqual("unchanged", utils.find_diff_of_numbers(5, 5.0)) - self.assertEqual([4, None], utils.find_diff_of_numbers(4, None)) - self.assertEqual("unchanged", utils.find_diff_of_numbers(None, None)) + self.assertEqual(1, profiler_utils.find_diff_of_numbers(5, 4)) + self.assertEqual(1.0, profiler_utils.find_diff_of_numbers(5.0, 4.0)) + self.assertEqual(1.0, profiler_utils.find_diff_of_numbers(5.0, 4)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_numbers(5.0, 5.0)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_numbers(5, 5.0)) + self.assertEqual([4, None], profiler_utils.find_diff_of_numbers(4, None)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_numbers(None, None)) # Ensure strings are handled appropriately self.assertEqual( - "unchanged", utils.find_diff_of_strings_and_bools("Hello", "Hello") + "unchanged", profiler_utils.find_diff_of_strings_and_bools("Hello", "Hello") + ) + self.assertEqual( + ["Hello", "team"], + profiler_utils.find_diff_of_strings_and_bools("Hello", "team"), ) self.assertEqual( - ["Hello", "team"], utils.find_diff_of_strings_and_bools("Hello", "team") + "unchanged", profiler_utils.find_diff_of_strings_and_bools(None, None) ) - self.assertEqual("unchanged", utils.find_diff_of_strings_and_bools(None, None)) # Ensure dates are handled appropriately a = datetime(2021, 6, 28) b = datetime(2021, 6, 27, 1) - self.assertEqual("unchanged", utils.find_diff_of_dates(a, a)) - self.assertEqual("+23:00:00", utils.find_diff_of_dates(a, b)) - 
self.assertEqual("-23:00:00", utils.find_diff_of_dates(b, a)) - self.assertEqual(["06/28/21 00:00:00", None], utils.find_diff_of_dates(a, None)) - self.assertEqual("unchanged", utils.find_diff_of_dates(None, None)) + self.assertEqual("unchanged", profiler_utils.find_diff_of_dates(a, a)) + self.assertEqual("+23:00:00", profiler_utils.find_diff_of_dates(a, b)) + self.assertEqual("-23:00:00", profiler_utils.find_diff_of_dates(b, a)) + self.assertEqual( + ["06/28/21 00:00:00", None], profiler_utils.find_diff_of_dates(a, None) + ) + self.assertEqual("unchanged", profiler_utils.find_diff_of_dates(None, None)) # Ensure that differencing dictionaries is handled appropriately dict1 = { @@ -131,7 +146,9 @@ def test_find_diff(self): "f": ["hi2", None], "g": [None, 15], } - self.assertDictEqual(expected_diff, utils.find_diff_of_dicts(dict1, dict2)) + self.assertDictEqual( + expected_diff, profiler_utils.find_diff_of_dicts(dict1, dict2) + ) dict1 = { "nested_key_one": {"fruit": ["apple", "banana", "orange"], "yes_no": False}, @@ -167,7 +184,9 @@ def test_find_diff(self): "additional_key": [None, "random_string"], } - self.assertDictEqual(expected_diff, utils.find_diff_of_dicts(dict1, dict2)) + self.assertDictEqual( + expected_diff, profiler_utils.find_diff_of_dicts(dict1, dict2) + ) def test_diff_of_dicts_with_diff_keys(self): dict1 = {"unique1": 1, "shared1": 2, "shared2": 3} @@ -181,11 +200,13 @@ def test_diff_of_dicts_with_diff_keys(self): # Assert difference is appropriate self.assertListEqual( - expected, utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) + expected, profiler_utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) ) # Assert empty dicts are unchanged - self.assertEqual("unchanged", utils.find_diff_of_dicts_with_diff_keys({}, {})) + self.assertEqual( + "unchanged", profiler_utils.find_diff_of_dicts_with_diff_keys({}, {}) + ) # Assert all edge cases work a = datetime(2021, 6, 28) @@ -215,7 +236,7 @@ def test_diff_of_dicts_with_diff_keys(self): {"unique2": 5}, ] self.assertListEqual( - expected, utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) + expected, profiler_utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) ) dict1 = { @@ -259,14 +280,14 @@ def test_diff_of_dicts_with_diff_keys(self): ] self.assertListEqual( - expected, utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) + expected, profiler_utils.find_diff_of_dicts_with_diff_keys(dict1, dict2) ) def test_list_diff_with_nan(self): # when lists are same length list_1 = [np.nan, 1.5, 6.7] list_2 = [np.nan, 1.5, np.nan] - diff_1 = utils.find_diff_of_lists_and_sets(list_1, list_2) + diff_1 = profiler_utils.find_diff_of_lists_and_sets(list_1, list_2) expected_diff_1 = [[6.7], [np.nan, 1.5], [np.nan]] for x, y in zip(diff_1, expected_diff_1): @@ -276,7 +297,7 @@ def test_list_diff_with_nan(self): # when lists aren't the same length list_3 = [np.nan, 1.5, 6.7, np.nan, np.nan, np.nan] list_4 = [4.2, 1.5, np.nan] - diff_2 = utils.find_diff_of_lists_and_sets(list_3, list_4) + diff_2 = profiler_utils.find_diff_of_lists_and_sets(list_3, list_4) expected_diff_2 = [[6.7, np.nan, np.nan, np.nan], [np.nan, 1.5], [4.2]] for x, y in zip(diff_2, expected_diff_2): @@ -285,7 +306,7 @@ def test_list_diff_with_nan(self): list_5 = [np.nan, np.nan] list_6 = [np.nan] - diff_3 = utils.find_diff_of_lists_and_sets(list_5, list_6) + diff_3 = profiler_utils.find_diff_of_lists_and_sets(list_5, list_6) expected_diff_3 = [[np.nan], [np.nan], []] for x, y in zip(diff_3, expected_diff_3): @@ -294,7 +315,7 @@ def test_list_diff_with_nan(self): list_7 = 
[np.nan, 3] list_8 = [np.nan, 3] - diff_4 = utils.find_diff_of_lists_and_sets(list_7, list_8) + diff_4 = profiler_utils.find_diff_of_lists_and_sets(list_7, list_8) expected_diff_4 = "unchanged" self.assertEqual(diff_4, expected_diff_4) @@ -307,7 +328,7 @@ def test_find_diff_of_matrices(self): # Check matrix subtraction of same size matrices expected_matrix = [[-10.0, np.nan, 3.0], [3.0, 0.0, 4.0], [np.nan, -12.0, 8.0]] - diff_matrix = utils.find_diff_of_matrices(matrix1, matrix2) + diff_matrix = profiler_utils.find_diff_of_matrices(matrix1, matrix2) comparison = ( (expected_matrix == diff_matrix) | (np.isnan(expected_matrix) & np.isnan(diff_matrix)) @@ -315,13 +336,15 @@ def test_find_diff_of_matrices(self): self.assertEqual(True, comparison) # Check matrix subtraction of same exact matrices - self.assertEqual("unchanged", utils.find_diff_of_matrices(matrix1, matrix1)) + self.assertEqual( + "unchanged", profiler_utils.find_diff_of_matrices(matrix1, matrix1) + ) # Check matrix subtraction with different sized matrices matrix1 = [[1, 2], [1, 2]] - self.assertIsNone(utils.find_diff_of_matrices(matrix1, matrix2)) + self.assertIsNone(profiler_utils.find_diff_of_matrices(matrix1, matrix2)) # Check matrix with none - self.assertIsNone(utils.find_diff_of_matrices(matrix1, None)) + self.assertIsNone(profiler_utils.find_diff_of_matrices(matrix1, None)) def test_get_memory_size(self): """ @@ -333,27 +356,30 @@ def test_get_memory_size(self): "Currently only supports the memory size unit " r"in \['B', 'K', 'M', 'G'\]", ): - utils.get_memory_size([], unit="wrong_unit") + profiler_utils.get_memory_size([], unit="wrong_unit") # test with different data sizes - self.assertEqual(0, utils.get_memory_size([])) + self.assertEqual(0, profiler_utils.get_memory_size([])) self.assertEqual( - 33 / 1024**2, utils.get_memory_size(["This is test, a Test sentence.!!!"]) + 33 / 1024**2, + profiler_utils.get_memory_size(["This is test, a Test sentence.!!!"]), ) self.assertEqual( 33 / 1024**2, - utils.get_memory_size(["This is test,", " a Test sentence.!!!"]), + profiler_utils.get_memory_size(["This is test,", " a Test sentence.!!!"]), ) self.assertEqual( 33 / 1024**3, - utils.get_memory_size(["This is test, a Test sentence.!!!"], unit="G"), + profiler_utils.get_memory_size( + ["This is test, a Test sentence.!!!"], unit="G" + ), ) @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler", spec=BaseDataLabeler) class TestProfileDistributedMerge(unittest.TestCase): """ - Validates utils.merge_profile_list is properly working. + Validates profiler_utils.merge_profile_list is properly working. 
""" @staticmethod @@ -393,7 +419,9 @@ def test_merge_profile_list(self, mock_data_labeler, *mocks): profile_two = dp.Profiler(data[2:]) list_of_profiles = [profile_one, profile_two] - single_profile = utils.merge_profile_list(list_of_profiles=list_of_profiles) + single_profile = profiler_utils.merge_profile_list( + list_of_profiles=list_of_profiles + ) single_report = single_profile.report() self.assertEqual(1, len(single_report["data_stats"])) @@ -428,7 +456,9 @@ def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): profile_three = dp.Profiler(data[2:]) list_of_profiles = [profile_one, profile_two, profile_three] - single_profile = utils.merge_profile_list(list_of_profiles=list_of_profiles) + single_profile = profiler_utils.merge_profile_list( + list_of_profiles=list_of_profiles + ) single_report = single_profile.report() self.assertEqual(1, len(single_report["data_stats"])) @@ -439,3 +469,52 @@ def test_odd_merge_profile_list(self, mock_data_labeler, *mocks): self.assertEqual(1, single_report["data_stats"][0]["statistics"]["min"]) self.assertEqual(60.0, single_report["data_stats"][0]["statistics"]["max"]) + + +class TestAutoMultiProcessToggle(unittest.TestCase): + + """ + Validate profile_utils.auto_multiprocess_toggle is properly working. + """ + + def test_auto_multiprocess_toggle(self): + rows_threshold = 5 + cols_threshold = 10 + + # Test for no multiprocessing for sufficiently small datasets + data = pd.DataFrame(np.random.random((2, 5))) + self.assertFalse( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + data = pd.DataFrame(np.random.random((5, 10))) + self.assertFalse( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + + # Test for multiprocessing with only rows passing threshold + data = pd.DataFrame(np.random.random((6, 10))) + self.assertTrue( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + + # Test for multiprocessing with only columns passing threshold + data = pd.DataFrame(np.random.random((5, 11))) + self.assertTrue( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) + + # Test for multiprocessing with both rows and columns passing threshold + data = pd.DataFrame(np.random.random((6, 11))) + self.assertTrue( + profiler_utils.auto_multiprocess_toggle( + data, rows_threshold, cols_threshold + ) + ) diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 0d578c6e9..98b87acbe 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from dataprofiler.profilers import TextColumn, utils +from dataprofiler.profilers import TextColumn, profiler_utils from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import TextOptions @@ -584,7 +584,7 @@ def test_diff(self): "median_absolute_deviation": -0.5, "variance": profile1["variance"] - profile2["variance"], "stddev": profile1["stddev"] - profiler2["stddev"], - "vocab": utils.find_diff_of_lists_and_sets( + "vocab": profiler_utils.find_diff_of_lists_and_sets( profile1["vocab"], profile2["vocab"] ), "t-test": { diff --git a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py 
index d3a47108b..c7aa8b0c5 100644
--- a/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
+++ b/dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-from dataprofiler.profilers import utils
+from dataprofiler.profilers import profiler_utils
 from dataprofiler.profilers.unstructured_labeler_profile import (
     UnstructuredLabelerProfile,
 )
diff --git a/dataprofiler/tests/profilers/utils.py b/dataprofiler/tests/profilers/utils.py
index bef54763d..d85dbf380 100644
--- a/dataprofiler/tests/profilers/utils.py
+++ b/dataprofiler/tests/profilers/utils.py
@@ -11,7 +11,7 @@
 from dataprofiler.profilers.column_profile_compilers import BaseCompiler
 from dataprofiler.profilers.profile_builder import BaseProfiler, StructuredColProfiler
 from dataprofiler.profilers.profiler_options import BaseOption
-from dataprofiler.profilers.utils import find_diff_of_dicts
+from dataprofiler.profilers.profiler_utils import find_diff_of_dicts
 from dataprofiler.tests.test_utils import patched_assert_warns
diff --git a/dataprofiler/tests/test_rng_utils.py b/dataprofiler/tests/test_rng_utils.py
new file mode 100644
index 000000000..6ee2ed35c
--- /dev/null
+++ b/dataprofiler/tests/test_rng_utils.py
@@ -0,0 +1,53 @@
+"""Validates that the generator consumes DATAPROFILER_SEED properly."""
+import os
+import unittest
+import unittest.mock
+
+from .. import rng_utils
+
+
+class TestGetRandomNumberGenerator(unittest.TestCase):
+    """Validates get_random_number_generator() is properly working."""
+
+    @unittest.mock.patch.dict(os.environ, {"DATAPROFILER_SEED": "0"})
+    @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=123)
+    def test_dataprofiler_seed_true_settings_seed_false(self):
+        """Test for DATAPROFILER_SEED in os.environ and settings._seed != None."""
+        with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator:
+            rng_utils.get_random_number_generator()
+        self.assertEqual(mock_np_generator.call_count, 1)
+        mock_np_generator.assert_called_with(123)
+
+    @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=None)
+    @unittest.mock.patch.dict("os.environ", clear=True)
+    def test_dataprofiler_seed_false_settings_seed_true(self):
+        """Test for DATAPROFILER_SEED not in os.environ and settings._seed == None."""
+        with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator:
+            rng_utils.get_random_number_generator()
+        self.assertEqual(mock_np_generator.call_count, 1)
+        mock_np_generator.assert_called_with(None)
+
+    @unittest.mock.patch.dict(os.environ, {"DATAPROFILER_SEED": "123"})
+    @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=None)
+    def test_dataprofiler_seed_true_settings_seed_true(self):
+        """Test for DATAPROFILER_SEED in os.environ and settings._seed == None."""
+        with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator:
+            rng_utils.get_random_number_generator()
+        self.assertEqual(mock_np_generator.call_count, 2)
+        mock_np_generator.assert_called_with(123)
+
+    @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=123)
+    @unittest.mock.patch.dict("os.environ", clear=True)
+    def test_dataprofiler_seed_false_settings_seed_false(self):
+        """Test for DATAPROFILER_SEED not in os.environ and settings._seed != None."""
+        with unittest.mock.patch("numpy.random.default_rng") as mock_np_generator:
+            rng_utils.get_random_number_generator()
+        self.assertEqual(mock_np_generator.call_count, 1)
+        mock_np_generator.assert_called_with(123)
+
+    @unittest.mock.patch.dict(os.environ, {"DATAPROFILER_SEED": "George Washington"})
+    @unittest.mock.patch("dataprofiler.rng_utils.settings._seed", new=None)
+    def test_warning_raised(self):
+        """Test that a warning is raised if the seed is not an integer."""
+        with self.assertWarnsRegex(RuntimeWarning, "Seed should be an integer"):
+            rng_utils.get_random_number_generator()
diff --git a/dataprofiler/version.py b/dataprofiler/version.py
index 070b4c9e3..0808d3c3e 100644
--- a/dataprofiler/version.py
+++ b/dataprofiler/version.py
@@ -2,7 +2,7 @@
 
 MAJOR = 0
 MINOR = 10
-MICRO = 2
+MICRO = 3
 POST = None  # otherwise None
 
 VERSION = "%d.%d.%d" % (MAJOR, MINOR, MICRO)
diff --git a/examples/structured_profilers.ipynb b/examples/structured_profilers.ipynb
index 10f9eddd2..b6a4409c9 100644
--- a/examples/structured_profilers.ipynb
+++ b/examples/structured_profilers.ipynb
@@ -245,7 +245,7 @@
     "\n",
     "Below, let's remove the histogram and increase the number of samples to the labeler component (1,000 samples). \n",
     "\n",
-    "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)."
+    "The full list of options is in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)."
    ]
   },
   {
diff --git a/examples/unstructured_profilers.ipynb b/examples/unstructured_profilers.ipynb
index 82169af5a..9ab754cc7 100644
--- a/examples/unstructured_profilers.ipynb
+++ b/examples/unstructured_profilers.ipynb
@@ -178,7 +178,7 @@
     "\n",
     "Below, let's remove the vocab count and set the stop words. \n",
     "\n",
-    "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)."
+    "The full list of options is in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler/profile_options.html)."
    ]
   },
   {
diff --git a/requirements.txt b/requirements.txt
index 8532aaabf..7c8aa0b99 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ fastavro>=1.0.0.post1
 python-snappy>=0.5.4
 charset-normalizer>=1.3.6
 psutil>=4.0.0
-scipy>=1.4.1
+scipy>=1.10.0
 requests>=2.28.1
 networkx>=2.5.1
 typing-extensions>=3.10.0.2
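
Taken together, the new test_rng_utils.py cases pin down the seeding precedence: an explicit settings._seed wins over the DATAPROFILER_SEED environment variable, the environment variable is consumed when settings._seed is None, and a non-integer value only raises a RuntimeWarning. A minimal usage sketch under those assumptions (settings._seed is a private attribute, and this treats get_random_number_generator as returning the numpy Generator it builds, so consider it illustrative rather than a supported API):

import os

from dataprofiler import rng_utils, settings

# An in-process seed takes precedence over the environment variable.
settings._seed = 123
os.environ["DATAPROFILER_SEED"] = "0"
rng = rng_utils.get_random_number_generator()  # seeded with 123

# With settings._seed unset, the integer environment seed is used instead.
settings._seed = None
rng = rng_utils.get_random_number_generator()  # seeded with 0
print(rng.integers(0, 10))  # reproducible across runs with the same seed
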