Skip to content

Commit

Permalink
Remove is_float_profile parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
junholee6a committed Aug 22, 2023
1 parent 6f2a6dd commit 0e324cc
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 14 deletions.
10 changes: 1 addition & 9 deletions dataprofiler/profilers/histogram_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@ def _assimilate_histogram(
dest_hist_entity_count_per_bin: np.ndarray,
dest_hist_bin_edges: np.ndarray,
dest_hist_num_bin: int,
is_float_profile: bool,
) -> tuple[dict[str, np.ndarray[Any, Any]], float]:
"""
Assimilates a histogram into another histogram using specifications.
Expand All @@ -375,8 +374,6 @@ def _assimilate_histogram(
:type dest_hist_bin_edges: List[Tuple[float]]
:param dest_hist_num_bin: The number of bins desired for histogram
:type dest_hist_num_bin: int
:param is_float_profile: Whether values in bins are floats
:type is_float_profile: bool
:return: Tuple containing dictionary of histogram info and histogram loss
"""
# allocate bin_counts
Expand All @@ -388,10 +385,6 @@ def _assimilate_histogram(

bin_edge = from_hist_bin_edges[bin_id : bin_id + 3]

# if we know not float, we can assume values in bins are integers.
if not is_float_profile:
bin_edge = np.round(bin_edge)

# loop until we have a new bin which contains the current bin.
while (
bin_edge[0] >= dest_hist_bin_edges[new_bin_id + 1]
Expand Down Expand Up @@ -441,7 +434,7 @@ def _assimilate_histogram(


def _regenerate_histogram(
entity_count_per_bin, bin_edges, suggested_bin_count, is_float_profile, options=None
entity_count_per_bin, bin_edges, suggested_bin_count, options=None
) -> tuple[dict[str, np.ndarray], float]:

# create proper binning
Expand All @@ -457,5 +450,4 @@ def _regenerate_histogram(
dest_hist_entity_count_per_bin=new_bin_counts,
dest_hist_bin_edges=new_bin_edges,
dest_hist_num_bin=suggested_bin_count,
is_float_profile=is_float_profile,
)
5 changes: 0 additions & 5 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,6 @@ def _add_helper_merge_profile_histograms(
dest_hist_entity_count_per_bin=new_entity_count_by_bin,
dest_hist_bin_edges=ideal_bin_edges,
dest_hist_num_bin=ideal_count_of_bins,
is_float_profile=self.__class__.__name__ == "FloatColumn",
)

# Ensure loss is calculated on second run of regenerate
Expand All @@ -246,7 +245,6 @@ def _add_helper_merge_profile_histograms(
dest_hist_entity_count_per_bin=new_entity_count_by_bin,
dest_hist_bin_edges=ideal_bin_edges,
dest_hist_num_bin=ideal_count_of_bins,
is_float_profile=self.__class__.__name__ == "FloatColumn",
)

aggregate_histogram_loss = hist_loss1 + hist_loss2
Expand Down Expand Up @@ -712,7 +710,6 @@ def _preprocess_for_calculate_psi(
entity_count_per_bin=self_histogram["bin_counts"],
bin_edges=self_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
is_float_profile=self.__class__.__name__ == "FloatColumn",
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
Expand All @@ -734,7 +731,6 @@ def _preprocess_for_calculate_psi(
entity_count_per_bin=other_histogram["bin_counts"],
bin_edges=other_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
is_float_profile=self.__class__.__name__ == "FloatColumn",
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
Expand Down Expand Up @@ -1404,7 +1400,6 @@ def _histogram_for_profile(
entity_count_per_bin=bin_counts,
bin_edges=bin_edges,
suggested_bin_count=suggested_bin_count,
is_float_profile=self.__class__.__name__ == "FloatColumn",
)

def _get_best_histogram_for_profile(self) -> dict:
Expand Down

0 comments on commit 0e324cc

Please sign in to comment.