diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 4d7044f9..c1767370 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -84,6 +84,7 @@ def __init__(self, options: NumericalOptions = None) -> None: self.num_zeros: int | np.int64 = np.int64(0) self.num_negatives: int | np.int64 = np.int64(0) self._num_quantiles: int = 1000 # By default, we use 1000 quantiles + self._greater_than_64_bit: bool = False if options: self.bias_correction = options.bias_correction.is_enabled @@ -1336,7 +1337,7 @@ def _merge_histogram(self, values: np.ndarray | pd.Series) -> None: self._stored_histogram["histogram"]["bin_counts"] = bin_counts self._stored_histogram["histogram"]["bin_edges"] = bin_edges - def _update_histogram(self, df_series: pd.Series) -> None: + def _update_histogram(self, df_series: pd.Series | np.ndarray) -> None: """ Update histogram for each method and the combined method. @@ -1358,7 +1359,8 @@ def _update_histogram(self, df_series: pd.Series) -> None: :return: """ df_series = pl.from_pandas(df_series, nan_to_null=True).cast(pl.Float64) - df_series = df_series.replace([np.inf, -np.inf], [None]).drop_nulls() + df_series = df_series.replace([np.inf, -np.inf], [None]) # type: ignore + df_series = df_series.drop_nulls() if df_series.is_empty(): return @@ -1751,13 +1753,26 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: :type profile: dict :return: None """ - df_series_clean = pl.from_pandas(df_series_clean) - if df_series_clean.dtype == pl.String: - df_series_clean = df_series_clean.str.strip_chars().cast(pl.Float64) + self._greater_than_64_bit = ( + not df_series_clean.empty + and df_series_clean.apply(pd.to_numeric, errors="coerce").dtype == "O" + ) + if self._greater_than_64_bit: + df_series_clean = df_series_clean.to_numpy() + df_series_clean = df_series_clean[df_series_clean != np.nan] + if df_series_clean.size == 0: + return + df_series_clean = pd.Series(df_series_clean) else: - df_series_clean = df_series_clean.cast(pl.Float64) - if df_series_clean.is_empty(): - return + df_series_clean = pl.from_pandas(df_series_clean) + if df_series_clean.dtype == pl.String: + df_series_clean = df_series_clean.str.strip_chars().cast(pl.Float64) + else: + df_series_clean = df_series_clean.cast(pl.Float64) + if df_series_clean.is_empty(): + return + df_series_clean = df_series_clean.to_pandas() + df_series_clean = df_series_clean.astype(float) prev_dependent_properties = { "mean": self.mean, @@ -1766,8 +1781,6 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: "biased_kurtosis": self._biased_kurtosis, } subset_properties = copy.deepcopy(profile) - df_series_clean = df_series_clean.to_pandas() - df_series_clean = df_series_clean.astype(float) super()._perform_property_calcs( # type: ignore self.__calculations, df_series=df_series_clean, @@ -1781,49 +1794,71 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: @BaseColumnProfiler._timeit(name="min") def _get_min( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: - df_series = pl.from_pandas(df_series) - min_value = df_series.min() - if self.min is not None: - min_value = type(self.min)(min_value) - self.min = min_value if not self.min else min(self.min, min_value) + if self._greater_than_64_bit: + min_value = np.min(df_series) + self.min = min_value if not self.min else min(self.min, min_value) + else: + df_series = pl.from_pandas(df_series) + min_value = df_series.min() + if self.min is not None: + min_value = type(self.min)(min_value) + self.min = np.float64( + min_value if not self.min else min(self.min, min_value) + ) subset_properties["min"] = min_value @BaseColumnProfiler._timeit(name="max") def _get_max( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: - df_series = pl.from_pandas(df_series) - max_value = df_series.max() - if self.max is not None: - max_value = type(self.max)(max_value) - self.max = max_value if not self.max else max(self.max, max_value) + if self._greater_than_64_bit: + max_value = np.max(df_series) + self.max = max_value if not self.max else max(self.max, max_value) + else: + df_series = pl.from_pandas(df_series) + max_value = df_series.max() + if self.max is not None: + max_value = type(self.max)(max_value) + self.max = np.float64( + max_value if not self.max else max(self.max, max_value) + ) subset_properties["max"] = max_value @BaseColumnProfiler._timeit(name="sum") def _get_sum( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if np.isinf(self.sum) or (np.isnan(self.sum) and self.match_count > 0): return - # df_series = pl.from_pandas(df_series) - sum_value = df_series.sum() - if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)): - warnings.warn( - "Infinite or invalid values found in data. " - "Future statistics (mean, variance, skewness, kurtosis) " - "will not be computed.", - RuntimeWarning, - ) + if self._greater_than_64_bit: + sum_value = np.sum(df_series) + if len(df_series) > 0 and sum_value == np.nan: + warnings.warn( + "Infinite or invalid values found in data. " + "Future statistics (mean, variance, skewness, kurtosis) " + "will not be computed.", + RuntimeWarning, + ) + else: + df_series = pl.from_pandas(df_series) + sum_value = df_series.sum() + if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)): + warnings.warn( + "Infinite or invalid values found in data. " + "Future statistics (mean, variance, skewness, kurtosis) " + "will not be computed.", + RuntimeWarning, + ) subset_properties["sum"] = sum_value self.sum = self.sum + sum_value @@ -1831,7 +1866,7 @@ def _get_sum( @BaseColumnProfiler._timeit(name="variance") def _get_variance( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1839,29 +1874,31 @@ def _get_variance( np.isnan(self._biased_variance) and self.match_count > 0 ): return - - # Suppress any numpy warnings as we have a custom warning for invalid - # or infinite data already - with np.errstate(all="ignore"): - batch_biased_variance = np.var(df_series) # Obtains biased variance + if self._greater_than_64_bit: + batch_biased_variance = np.var(df_series) + else: + df_series = pl.from_pandas(df_series) + batch_biased_variance = np.var([df_series]) subset_properties["biased_variance"] = batch_biased_variance sum_value = subset_properties["sum"] batch_count = subset_properties["match_count"] batch_mean = 0.0 if not batch_count else float(sum_value) / batch_count subset_properties["mean"] = batch_mean - self._biased_variance = self._merge_biased_variance( - self.match_count, - self._biased_variance, - prev_dependent_properties["mean"], - batch_count, - batch_biased_variance, - batch_mean, + self._biased_variance = np.float64( + self._merge_biased_variance( + self.match_count, + self._biased_variance, + prev_dependent_properties["mean"], + batch_count, + batch_biased_variance, + batch_mean, + ) ) @BaseColumnProfiler._timeit(name="skewness") def _get_skewness( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1905,7 +1942,7 @@ def _get_skewness( @BaseColumnProfiler._timeit(name="kurtosis") def _get_kurtosis( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1952,7 +1989,7 @@ def _get_kurtosis( @BaseColumnProfiler._timeit(name="histogram_and_quantiles") def _get_histogram_and_quantiles( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1970,7 +2007,7 @@ def _get_histogram_and_quantiles( @BaseColumnProfiler._timeit(name="num_zeros") def _get_num_zeros( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1985,6 +2022,8 @@ def _get_num_zeros( :type subset_properties: dict :return: None """ + if not self._greater_than_64_bit: + df_series = pl.from_pandas(df_series) num_zeros_value = (df_series == 0).sum() subset_properties["num_zeros"] = num_zeros_value self.num_zeros = self.num_zeros + num_zeros_value @@ -1992,7 +2031,7 @@ def _get_num_zeros( @BaseColumnProfiler._timeit(name="num_negatives") def _get_num_negatives( self, - df_series: pd.Series, + df_series: pd.Series | np.ndarray, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2007,6 +2046,8 @@ def _get_num_negatives( :type subset_properties: dict :return: None """ + if not self._greater_than_64_bit: + df_series = pl.from_pandas(df_series) num_negatives_value = (df_series < 0).sum() subset_properties["num_negatives"] = num_negatives_value self.num_negatives = self.num_negatives + num_negatives_value diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index d79fdd64..c92fc5cd 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -1792,6 +1792,7 @@ def test_json_encode(self): "num_zeros": 0, "num_negatives": 0, "_num_quantiles": 1000, + "_greater_than_64_bit": False, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 0.0, @@ -1890,6 +1891,7 @@ def test_json_encode_after_update(self, time): "num_zeros": 1, "num_negatives": 0, "_num_quantiles": 4, + "_greater_than_64_bit": False, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 2.0, diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 961b33c8..718348cf 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -1151,6 +1151,7 @@ def test_json_encode(self): "num_zeros": 0, "num_negatives": 0, "_num_quantiles": 1000, + "_greater_than_64_bit": False, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 0.0, @@ -1233,6 +1234,7 @@ def test_json_encode_after_update(self, time): "num_zeros": 1, "num_negatives": 0, "_num_quantiles": 1000, + "_greater_than_64_bit": False, "histogram_methods": { "custom": { "total_loss": 0.0, diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index e112781a..7b4d2ccc 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -1234,6 +1234,7 @@ def test_json_encode(self): "num_zeros": 0, "num_negatives": 0, "_num_quantiles": 1000, + "_greater_than_64_bit": False, "histogram_methods": expected_historam_methods, "_stored_histogram": { "total_loss": 0.0, diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index 12fb1d27..61a54afe 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -671,6 +671,7 @@ def test_json_encode_after_update(self, time): "num_zeros": 0, "num_negatives": 0, "_num_quantiles": 1000, + "_greater_than_64_bit": False, "histogram_methods": { "custom": { "total_loss": 0.0,