Skip to content

Commit

Permalink
Make _assimilate_histogram not use self (#1071)
Browse files: browse the repository at this point in the history
Co-authored-by: Taylor Turner <[email protected]>
  • Loading branch information
junholee6a and taylorfturner authored Jan 10, 2024
1 parent 683a91e commit 4a4329d
Showing 1 changed file with 19 additions and 7 deletions.
26 changes: 19 additions & 7 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pandas as pd
import scipy.stats

from . import histogram_utils, profiler_utils
from . import float_column_profile, histogram_utils, profiler_utils
from .base_column_profilers import BaseColumnProfiler
from .profiler_options import NumericalOptions

Expand Down Expand Up @@ -710,6 +710,9 @@ def _preprocess_for_calculate_psi(
entity_count_per_bin=self_histogram["bin_counts"],
bin_edges=self_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
is_float_histogram=isinstance(
self, float_column_profile.FloatColumn
),
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
Expand All @@ -731,6 +734,9 @@ def _preprocess_for_calculate_psi(
entity_count_per_bin=other_histogram["bin_counts"],
bin_edges=other_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
is_float_histogram=isinstance(
self, float_column_profile.FloatColumn
),
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
Expand Down Expand Up @@ -1360,7 +1366,12 @@ def _update_histogram(self, df_series: pd.Series) -> None:
self._stored_histogram["total_loss"] += histogram_loss

def _regenerate_histogram(
self, entity_count_per_bin, bin_edges, suggested_bin_count, options=None
self,
entity_count_per_bin,
bin_edges,
suggested_bin_count,
is_float_histogram,
options=None,
) -> tuple[dict[str, np.ndarray], float]:

# create proper binning
Expand All @@ -1372,6 +1383,11 @@ def _regenerate_histogram(
new_bin_edges = np.linspace(
options["min_edge"], options["max_edge"], suggested_bin_count + 1
)

# if it's not a float histogram, then assume it only contains integer values
if not is_float_histogram:
bin_edges = np.round(bin_edges)

return self._assimilate_histogram(
from_hist_entity_count_per_bin=entity_count_per_bin,
from_hist_bin_edges=bin_edges,
Expand Down Expand Up @@ -1417,11 +1433,6 @@ def _assimilate_histogram(

bin_edge = from_hist_bin_edges[bin_id : bin_id + 3]

# if we know not float, we can assume values in bins are integers.
is_float_profile = self.__class__.__name__ == "FloatColumn"
if not is_float_profile:
bin_edge = np.round(bin_edge)

# loop until we have a new bin which contains the current bin.
while (
bin_edge[0] >= dest_hist_bin_edges[new_bin_id + 1]
Expand Down Expand Up @@ -1513,6 +1524,7 @@ def _histogram_for_profile(
entity_count_per_bin=bin_counts,
bin_edges=bin_edges,
suggested_bin_count=suggested_bin_count,
is_float_histogram=isinstance(self, float_column_profile.FloatColumn),
)

def _get_best_histogram_for_profile(self) -> dict:
Expand Down

0 comments on commit 4a4329d

Please sign in to comment.