From 6f2a6dd0761e392fe5ea4f6ad2172dfc242f6820 Mon Sep 17 00:00:00 2001
From: Junho Lee <junholee6a@gmail.com>
Date: Sat, 5 Aug 2023 15:49:26 -0400
Subject: [PATCH 1/4] Move functions to histogram_utils

The methods _regenerate_histogram and _assimilate_histogram hardly use
self and can be moved into the histogram_utils module.
---
 dataprofiler/profilers/histogram_utils.py     | 113 +++++++++++++++-
 .../profilers/numerical_column_stats.py       | 128 ++----------------
 2 files changed, 122 insertions(+), 119 deletions(-)

diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py
index df230c4c7..6cb6ceebf 100644
--- a/dataprofiler/profilers/histogram_utils.py
+++ b/dataprofiler/profilers/histogram_utils.py
@@ -8,7 +8,7 @@
 https://github.com/numpy/numpy/blob/main/LICENSE.txt
 """
 import operator
-from typing import List, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 
 import numpy as np
 from numpy.lib.histograms import (  # type: ignore[attr-defined]
@@ -348,3 +348,114 @@ def _calculate_bins_from_profile(profile, bin_method):
             # the IQR of the data is zero.
             n_equal_bins = 1
     return n_equal_bins
+
+
+def _assimilate_histogram(
+    from_hist_entity_count_per_bin: np.ndarray,
+    from_hist_bin_edges: np.ndarray,
+    dest_hist_entity_count_per_bin: np.ndarray,
+    dest_hist_bin_edges: np.ndarray,
+    dest_hist_num_bin: int,
+    is_float_profile: bool,
+) -> tuple[dict[str, np.ndarray[Any, Any]], float]:
+    """
+    Assimilates a histogram into another histogram using specifications.
+
+    :param from_hist_entity_count_per_bin: the breakdown of number of entities
+        within a given histogram bin (to be assimilated)
+    :type from_hist_entity_count_per_bin: List[float]
+    :param from_hist_bin_edges: List of value ranges for histogram bins
+        (to be assimilated)
+    :type from_hist_bin_edges: List[Tuple[float]]
+    :param from_hist_entity_count_per_bin: the breakdown of number of
+        entities within a given histogram bin (assimilated to)
+    :type dest_hist_entity_count_per_bin: List[float]
+    :param dest_hist_bin_edges: List of value ranges for histogram bins
+        (assimilated to)
+    :type dest_hist_bin_edges: List[Tuple[float]]
+    :param dest_hist_num_bin: The number of bins desired for histogram
+    :type dest_hist_num_bin: int
+    :param is_float_profile: Whether values in bins are floats
+    :type is_float_profile: bool
+    :return: Tuple containing dictionary of histogram info and histogram loss
+    """
+    # allocate bin_counts
+    new_bin_id = 0
+    hist_loss = 0
+    for bin_id, bin_count in enumerate(from_hist_entity_count_per_bin):
+        if not bin_count:  # if nothing in bin, nothing to add
+            continue
+
+        bin_edge = from_hist_bin_edges[bin_id : bin_id + 3]
+
+        # if we know not float, we can assume values in bins are integers.
+        if not is_float_profile:
+            bin_edge = np.round(bin_edge)
+
+        # loop until we have a new bin which contains the current bin.
+        while (
+            bin_edge[0] >= dest_hist_bin_edges[new_bin_id + 1]
+            and new_bin_id < dest_hist_num_bin - 1
+        ):
+            new_bin_id += 1
+
+        new_bin_edge = dest_hist_bin_edges[new_bin_id : new_bin_id + 3]
+
+        # find where the current bin falls within the new bins
+        is_last_bin = new_bin_id == dest_hist_num_bin - 1
+        if bin_edge[1] < new_bin_edge[1] or is_last_bin:
+            # current bin is within the new bin
+            dest_hist_entity_count_per_bin[new_bin_id] += bin_count
+            hist_loss += (
+                ((new_bin_edge[1] + new_bin_edge[0]) - (bin_edge[1] + bin_edge[0])) / 2
+            ) ** 2 * bin_count
+        elif bin_edge[0] < new_bin_edge[1]:
+            # current bin straddles two of the new bins
+            # get the percentage of bin that falls to the left
+            percentage_in_left_bin = (new_bin_edge[1] - bin_edge[0]) / (
+                bin_edge[1] - bin_edge[0]
+            )
+            count_in_left_bin = round(bin_count * percentage_in_left_bin)
+            dest_hist_entity_count_per_bin[new_bin_id] += count_in_left_bin
+            hist_loss += (
+                ((new_bin_edge[1] + new_bin_edge[0]) - (bin_edge[1] + bin_edge[0])) / 2
+            ) ** 2 * count_in_left_bin
+
+            # allocate leftovers to the right bin
+            dest_hist_entity_count_per_bin[new_bin_id + 1] += (
+                bin_count - count_in_left_bin
+            )
+            hist_loss += (
+                ((new_bin_edge[2] + new_bin_edge[1]) - (bin_edge[1] + bin_edge[0])) / 2
+            ) ** 2 * (bin_count - count_in_left_bin)
+
+            # increment bin id to the right bin
+            new_bin_id += 1
+    return (
+        {
+            "bin_edges": dest_hist_bin_edges,
+            "bin_counts": dest_hist_entity_count_per_bin,
+        },
+        hist_loss,
+    )
+
+
+def _regenerate_histogram(
+    entity_count_per_bin, bin_edges, suggested_bin_count, is_float_profile, options=None
+) -> tuple[dict[str, np.ndarray], float]:
+
+    # create proper binning
+    new_bin_counts = np.zeros((suggested_bin_count,))
+    new_bin_edges = np.linspace(bin_edges[0], bin_edges[-1], suggested_bin_count + 1)
+    if options:
+        new_bin_edges = np.linspace(
+            options["min_edge"], options["max_edge"], suggested_bin_count + 1
+        )
+    return _assimilate_histogram(
+        from_hist_entity_count_per_bin=entity_count_per_bin,
+        from_hist_bin_edges=bin_edges,
+        dest_hist_entity_count_per_bin=new_bin_counts,
+        dest_hist_bin_edges=new_bin_edges,
+        dest_hist_num_bin=suggested_bin_count,
+        is_float_profile=is_float_profile,
+    )
diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
index 2b35c8792..3dbe3b8aa 100644
--- a/dataprofiler/profilers/numerical_column_stats.py
+++ b/dataprofiler/profilers/numerical_column_stats.py
@@ -226,7 +226,7 @@ def _add_helper_merge_profile_histograms(
         new_entity_count_by_bin = np.zeros((ideal_count_of_bins,))
 
         # Generate new histograms
-        _, hist_loss1 = self._assimilate_histogram(
+        _, hist_loss1 = histogram_utils._assimilate_histogram(
             from_hist_entity_count_per_bin=other1._stored_histogram["histogram"][
                 "bin_counts"
             ],
@@ -234,10 +234,11 @@ def _add_helper_merge_profile_histograms(
             dest_hist_entity_count_per_bin=new_entity_count_by_bin,
             dest_hist_bin_edges=ideal_bin_edges,
             dest_hist_num_bin=ideal_count_of_bins,
+            is_float_profile=self.__class__.__name__ == "FloatColumn",
         )
 
         # Ensure loss is calculated on second run of regenerate
-        _, hist_loss2 = self._assimilate_histogram(
+        _, hist_loss2 = histogram_utils._assimilate_histogram(
             from_hist_entity_count_per_bin=other2._stored_histogram["histogram"][
                 "bin_counts"
             ],
@@ -245,6 +246,7 @@ def _add_helper_merge_profile_histograms(
             dest_hist_entity_count_per_bin=new_entity_count_by_bin,
             dest_hist_bin_edges=ideal_bin_edges,
             dest_hist_num_bin=ideal_count_of_bins,
+            is_float_profile=self.__class__.__name__ == "FloatColumn",
         )
 
         aggregate_histogram_loss = hist_loss1 + hist_loss2
@@ -706,10 +708,11 @@ def _preprocess_for_calculate_psi(
 
             # re-calculate `self` histogram
             if len_self_bin_counts != num_psi_bins:
-                histogram, hist_loss = self._regenerate_histogram(
+                histogram, hist_loss = histogram_utils._regenerate_histogram(
                     entity_count_per_bin=self_histogram["bin_counts"],
                     bin_edges=self_histogram["bin_edges"],
                     suggested_bin_count=num_psi_bins,
+                    is_float_profile=self.__class__.__name__ == "FloatColumn",
                     options={
                         "min_edge": min_min_edge,
                         "max_edge": max_max_edge,
@@ -727,10 +730,11 @@ def _preprocess_for_calculate_psi(
                 histogram_edges_not_equal = True
 
             if histogram_edges_not_equal:
-                histogram, hist_loss = self._regenerate_histogram(
+                histogram, hist_loss = histogram_utils._regenerate_histogram(
                     entity_count_per_bin=other_histogram["bin_counts"],
                     bin_edges=other_histogram["bin_edges"],
                     suggested_bin_count=num_psi_bins,
+                    is_float_profile=self.__class__.__name__ == "FloatColumn",
                     options={
                         "min_edge": min_min_edge,
                         "max_edge": max_max_edge,
@@ -1359,119 +1363,6 @@ def _update_histogram(self, df_series: pd.Series) -> None:
         self._stored_histogram["current_loss"] = histogram_loss
         self._stored_histogram["total_loss"] += histogram_loss
 
-    def _regenerate_histogram(
-        self, entity_count_per_bin, bin_edges, suggested_bin_count, options=None
-    ) -> tuple[dict[str, np.ndarray], float]:
-
-        # create proper binning
-        new_bin_counts = np.zeros((suggested_bin_count,))
-        new_bin_edges = np.linspace(
-            bin_edges[0], bin_edges[-1], suggested_bin_count + 1
-        )
-        if options:
-            new_bin_edges = np.linspace(
-                options["min_edge"], options["max_edge"], suggested_bin_count + 1
-            )
-        return self._assimilate_histogram(
-            from_hist_entity_count_per_bin=entity_count_per_bin,
-            from_hist_bin_edges=bin_edges,
-            dest_hist_entity_count_per_bin=new_bin_counts,
-            dest_hist_bin_edges=new_bin_edges,
-            dest_hist_num_bin=suggested_bin_count,
-        )
-
-    def _assimilate_histogram(
-        self,
-        from_hist_entity_count_per_bin: np.ndarray,
-        from_hist_bin_edges: np.ndarray,
-        dest_hist_entity_count_per_bin: np.ndarray,
-        dest_hist_bin_edges: np.ndarray,
-        dest_hist_num_bin: int,
-    ) -> tuple[dict[str, np.ndarray[Any, Any]], float]:
-        """
-        Assimilates a histogram into another histogram using specifications.
-
-        :param from_hist_entity_count_per_bin: the breakdown of number of entities
-            within a given histogram bin (to be assimilated)
-        :type from_hist_entity_count_per_bin: List[float]
-        :param from_hist_bin_edges: List of value ranges for histogram bins
-            (to be assimilated)
-        :type from_hist_bin_edges: List[Tuple[float]]
-        :param from_hist_entity_count_per_bin: the breakdown of number of
-            entities within a given histogram bin (assimilated to)
-        :type dest_hist_entity_count_per_bin: List[float]
-        :param dest_hist_bin_edges: List of value ranges for histogram bins
-            (assimilated to)
-        :type dest_hist_bin_edges: List[Tuple[float]]
-        :type dest_hist_bin_edges: List[Tuple[float]
-        :param dest_hist_num_bin: The number of bins desired for histogram
-        :type dest_hist_num_bin: int
-        :return: Tuple containing dictionary of histogram info and histogram loss
-        """
-        # allocate bin_counts
-        new_bin_id = 0
-        hist_loss = 0
-        for bin_id, bin_count in enumerate(from_hist_entity_count_per_bin):
-            if not bin_count:  # if nothing in bin, nothing to add
-                continue
-
-            bin_edge = from_hist_bin_edges[bin_id : bin_id + 3]
-
-            # if we know not float, we can assume values in bins are integers.
-            is_float_profile = self.__class__.__name__ == "FloatColumn"
-            if not is_float_profile:
-                bin_edge = np.round(bin_edge)
-
-            # loop until we have a new bin which contains the current bin.
-            while (
-                bin_edge[0] >= dest_hist_bin_edges[new_bin_id + 1]
-                and new_bin_id < dest_hist_num_bin - 1
-            ):
-                new_bin_id += 1
-
-            new_bin_edge = dest_hist_bin_edges[new_bin_id : new_bin_id + 3]
-
-            # find where the current bin falls within the new bins
-            is_last_bin = new_bin_id == dest_hist_num_bin - 1
-            if bin_edge[1] < new_bin_edge[1] or is_last_bin:
-                # current bin is within the new bin
-                dest_hist_entity_count_per_bin[new_bin_id] += bin_count
-                hist_loss += (
-                    ((new_bin_edge[1] + new_bin_edge[0]) - (bin_edge[1] + bin_edge[0]))
-                    / 2
-                ) ** 2 * bin_count
-            elif bin_edge[0] < new_bin_edge[1]:
-                # current bin straddles two of the new bins
-                # get the percentage of bin that falls to the left
-                percentage_in_left_bin = (new_bin_edge[1] - bin_edge[0]) / (
-                    bin_edge[1] - bin_edge[0]
-                )
-                count_in_left_bin = round(bin_count * percentage_in_left_bin)
-                dest_hist_entity_count_per_bin[new_bin_id] += count_in_left_bin
-                hist_loss += (
-                    ((new_bin_edge[1] + new_bin_edge[0]) - (bin_edge[1] + bin_edge[0]))
-                    / 2
-                ) ** 2 * count_in_left_bin
-
-                # allocate leftovers to the right bin
-                dest_hist_entity_count_per_bin[new_bin_id + 1] += (
-                    bin_count - count_in_left_bin
-                )
-                hist_loss += (
-                    ((new_bin_edge[2] + new_bin_edge[1]) - (bin_edge[1] + bin_edge[0]))
-                    / 2
-                ) ** 2 * (bin_count - count_in_left_bin)
-
-                # increment bin id to the right bin
-                new_bin_id += 1
-        return (
-            {
-                "bin_edges": dest_hist_bin_edges,
-                "bin_counts": dest_hist_entity_count_per_bin,
-            },
-            hist_loss,
-        )
-
     def _histogram_for_profile(
         self, histogram_method: str
     ) -> tuple[dict[str, np.ndarray], float]:
@@ -1509,10 +1400,11 @@ def _histogram_for_profile(
                 self._stored_histogram["total_loss"],
             )
 
-        return self._regenerate_histogram(
+        return histogram_utils._regenerate_histogram(
             entity_count_per_bin=bin_counts,
             bin_edges=bin_edges,
             suggested_bin_count=suggested_bin_count,
+            is_float_profile=self.__class__.__name__ == "FloatColumn",
         )
 
     def _get_best_histogram_for_profile(self) -> dict:

From 0e324ccfb7bf06c8ae1d30851519ca5b365b1dbd Mon Sep 17 00:00:00 2001
From: Junho Lee <junholee6a@gmail.com>
Date: Tue, 8 Aug 2023 22:57:30 -0400
Subject: [PATCH 2/4] Remove is_float_column parameter

---
 dataprofiler/profilers/histogram_utils.py        | 10 +---------
 dataprofiler/profilers/numerical_column_stats.py |  5 -----
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py
index 6cb6ceebf..9cf77a382 100644
--- a/dataprofiler/profilers/histogram_utils.py
+++ b/dataprofiler/profilers/histogram_utils.py
@@ -356,7 +356,6 @@ def _assimilate_histogram(
     dest_hist_entity_count_per_bin: np.ndarray,
     dest_hist_bin_edges: np.ndarray,
     dest_hist_num_bin: int,
-    is_float_profile: bool,
 ) -> tuple[dict[str, np.ndarray[Any, Any]], float]:
     """
     Assimilates a histogram into another histogram using specifications.
@@ -375,8 +374,6 @@ def _assimilate_histogram(
     :type dest_hist_bin_edges: List[Tuple[float]]
     :param dest_hist_num_bin: The number of bins desired for histogram
     :type dest_hist_num_bin: int
-    :param is_float_profile: Whether values in bins are floats
-    :type is_float_profile: bool
     :return: Tuple containing dictionary of histogram info and histogram loss
     """
     # allocate bin_counts
@@ -388,10 +385,6 @@ def _assimilate_histogram(
 
         bin_edge = from_hist_bin_edges[bin_id : bin_id + 3]
 
-        # if we know not float, we can assume values in bins are integers.
-        if not is_float_profile:
-            bin_edge = np.round(bin_edge)
-
         # loop until we have a new bin which contains the current bin.
         while (
             bin_edge[0] >= dest_hist_bin_edges[new_bin_id + 1]
@@ -441,7 +434,7 @@ def _assimilate_histogram(
 
 
 def _regenerate_histogram(
-    entity_count_per_bin, bin_edges, suggested_bin_count, is_float_profile, options=None
+    entity_count_per_bin, bin_edges, suggested_bin_count, options=None
 ) -> tuple[dict[str, np.ndarray], float]:
 
     # create proper binning
@@ -457,5 +450,4 @@ def _regenerate_histogram(
         dest_hist_entity_count_per_bin=new_bin_counts,
         dest_hist_bin_edges=new_bin_edges,
         dest_hist_num_bin=suggested_bin_count,
-        is_float_profile=is_float_profile,
     )
diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
index 3dbe3b8aa..3d4531da5 100644
--- a/dataprofiler/profilers/numerical_column_stats.py
+++ b/dataprofiler/profilers/numerical_column_stats.py
@@ -234,7 +234,6 @@ def _add_helper_merge_profile_histograms(
             dest_hist_entity_count_per_bin=new_entity_count_by_bin,
             dest_hist_bin_edges=ideal_bin_edges,
             dest_hist_num_bin=ideal_count_of_bins,
-            is_float_profile=self.__class__.__name__ == "FloatColumn",
         )
 
         # Ensure loss is calculated on second run of regenerate
@@ -246,7 +245,6 @@ def _add_helper_merge_profile_histograms(
             dest_hist_entity_count_per_bin=new_entity_count_by_bin,
             dest_hist_bin_edges=ideal_bin_edges,
             dest_hist_num_bin=ideal_count_of_bins,
-            is_float_profile=self.__class__.__name__ == "FloatColumn",
         )
 
         aggregate_histogram_loss = hist_loss1 + hist_loss2
@@ -712,7 +710,6 @@ def _preprocess_for_calculate_psi(
                     entity_count_per_bin=self_histogram["bin_counts"],
                     bin_edges=self_histogram["bin_edges"],
                     suggested_bin_count=num_psi_bins,
-                    is_float_profile=self.__class__.__name__ == "FloatColumn",
                     options={
                         "min_edge": min_min_edge,
                         "max_edge": max_max_edge,
@@ -734,7 +731,6 @@ def _preprocess_for_calculate_psi(
                     entity_count_per_bin=other_histogram["bin_counts"],
                     bin_edges=other_histogram["bin_edges"],
                     suggested_bin_count=num_psi_bins,
-                    is_float_profile=self.__class__.__name__ == "FloatColumn",
                     options={
                         "min_edge": min_min_edge,
                         "max_edge": max_max_edge,
@@ -1404,7 +1400,6 @@ def _histogram_for_profile(
             entity_count_per_bin=bin_counts,
             bin_edges=bin_edges,
             suggested_bin_count=suggested_bin_count,
-            is_float_profile=self.__class__.__name__ == "FloatColumn",
         )
 
     def _get_best_histogram_for_profile(self) -> dict:

From c9fba268f9924044c1906c3dd0bb4d6fd7dfeab5 Mon Sep 17 00:00:00 2001
From: Junho Lee <junholee6a@gmail.com>
Date: Sat, 2 Sep 2023 16:02:27 -0400
Subject: [PATCH 3/4] Use Dict, Tuple instead of dict, tuple

---
 dataprofiler/profilers/histogram_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py
index 9cf77a382..c3e71a67f 100644
--- a/dataprofiler/profilers/histogram_utils.py
+++ b/dataprofiler/profilers/histogram_utils.py
@@ -8,7 +8,7 @@
 https://github.com/numpy/numpy/blob/main/LICENSE.txt
 """
 import operator
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from numpy.lib.histograms import (  # type: ignore[attr-defined]
@@ -356,7 +356,7 @@ def _assimilate_histogram(
     dest_hist_entity_count_per_bin: np.ndarray,
     dest_hist_bin_edges: np.ndarray,
     dest_hist_num_bin: int,
-) -> tuple[dict[str, np.ndarray[Any, Any]], float]:
+) -> Tuple[Dict[str, np.ndarray[Any, Any]], float]:
     """
     Assimilates a histogram into another histogram using specifications.
 
@@ -435,7 +435,7 @@ def _assimilate_histogram(
 
 def _regenerate_histogram(
     entity_count_per_bin, bin_edges, suggested_bin_count, options=None
-) -> tuple[dict[str, np.ndarray], float]:
+) -> Tuple[Dict[str, np.ndarray], float]:
 
     # create proper binning
     new_bin_counts = np.zeros((suggested_bin_count,))

From c840415363e8a575e99bfb87c3ce28807a52b2eb Mon Sep 17 00:00:00 2001
From: Junho Lee <junholee6a@gmail.com>
Date: Sat, 2 Sep 2023 16:15:31 -0400
Subject: [PATCH 4/4] Remove np.ndarray type subscription

---
 dataprofiler/profilers/histogram_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataprofiler/profilers/histogram_utils.py b/dataprofiler/profilers/histogram_utils.py
index c3e71a67f..8870a2142 100644
--- a/dataprofiler/profilers/histogram_utils.py
+++ b/dataprofiler/profilers/histogram_utils.py
@@ -8,7 +8,7 @@
 https://github.com/numpy/numpy/blob/main/LICENSE.txt
 """
 import operator
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from numpy.lib.histograms import (  # type: ignore[attr-defined]
@@ -356,7 +356,7 @@ def _assimilate_histogram(
     dest_hist_entity_count_per_bin: np.ndarray,
     dest_hist_bin_edges: np.ndarray,
     dest_hist_num_bin: int,
-) -> Tuple[Dict[str, np.ndarray[Any, Any]], float]:
+) -> Tuple[Dict[str, np.ndarray], float]:
     """
     Assimilates a histogram into another histogram using specifications.