diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 6086b575..fa0666a6 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -13,7 +13,7 @@ import pandas as pd import scipy.stats -from . import histogram_utils, profiler_utils +from . import float_column_profile, histogram_utils, profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import NumericalOptions @@ -710,6 +710,9 @@ def _preprocess_for_calculate_psi( entity_count_per_bin=self_histogram["bin_counts"], bin_edges=self_histogram["bin_edges"], suggested_bin_count=num_psi_bins, + is_float_histogram=isinstance( + self, float_column_profile.FloatColumn + ), options={ "min_edge": min_min_edge, "max_edge": max_max_edge, @@ -731,6 +734,9 @@ def _preprocess_for_calculate_psi( entity_count_per_bin=other_histogram["bin_counts"], bin_edges=other_histogram["bin_edges"], suggested_bin_count=num_psi_bins, + is_float_histogram=isinstance( + self, float_column_profile.FloatColumn + ), options={ "min_edge": min_min_edge, "max_edge": max_max_edge, @@ -1360,7 +1366,12 @@ def _update_histogram(self, df_series: pd.Series) -> None: self._stored_histogram["total_loss"] += histogram_loss def _regenerate_histogram( - self, entity_count_per_bin, bin_edges, suggested_bin_count, options=None + self, + entity_count_per_bin, + bin_edges, + suggested_bin_count, + is_float_histogram, + options=None, ) -> tuple[dict[str, np.ndarray], float]: # create proper binning @@ -1372,6 +1383,11 @@ def _regenerate_histogram( new_bin_edges = np.linspace( options["min_edge"], options["max_edge"], suggested_bin_count + 1 ) + + # if it's not a float histogram, then assume it only contains integer values + if not is_float_histogram: + bin_edges = np.round(bin_edges) + return self._assimilate_histogram( from_hist_entity_count_per_bin=entity_count_per_bin, from_hist_bin_edges=bin_edges, @@ -1417,11 +1433,6 @@ def _assimilate_histogram( bin_edge = from_hist_bin_edges[bin_id : bin_id + 3] - # if we know not float, we can assume values in bins are integers. - is_float_profile = self.__class__.__name__ == "FloatColumn" - if not is_float_profile: - bin_edge = np.round(bin_edge) - # loop until we have a new bin which contains the current bin. while ( bin_edge[0] >= dest_hist_bin_edges[new_bin_id + 1] @@ -1513,6 +1524,7 @@ def _histogram_for_profile( entity_count_per_bin=bin_counts, bin_edges=bin_edges, suggested_bin_count=suggested_bin_count, + is_float_histogram=isinstance(self, float_column_profile.FloatColumn), ) def _get_best_histogram_for_profile(self) -> dict: