diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index f0783747..19bb19c6 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -280,7 +280,7 @@ def _get_float_precision( :param df_series_clean: df series with nulls removed, assumes all values are floats as well - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param sample_ratio: Ratio of samples used for float precision :type sample_ratio: float (between 0 and 1) :return: string representing its precision print format @@ -332,9 +332,9 @@ def _is_each_row_float(cls, df_series: pl.Series) -> pl.Series: For column [1.0, np.NaN, 1.0] returns [True, True, True] For column [1.0, "a", "b"] returns [True, False, False] :param df_series: series of values to evaluate - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: is_float_col - :rtype: Union[List[bool], pandas.Series[bool]] + :rtype: pl.Series """ if len(df_series) == 0: return pl.Series() @@ -361,7 +361,7 @@ def _update_precision( subset before they are merged into the main data profile. :type subset_properties: dict :param df_series: Data to be profiled - :type df_series: pandas.DataFrame + :type df_series: polars.DataFrame :return: None """ sample_ratio = None @@ -403,19 +403,18 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: Update column profile properties with cleaned dataset and its known profile. 
:param df_series_clean: df series with nulls removed -    :type df_series_clean: pandas.core.series.Series +    :type df_series_clean: polars.series.series.Series :param profile: float profile dictionary :type profile: dict :return: None """ -        df_series_clean = df_series_clean.to_pandas() if self._NumericStatsMixin__calculations: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) def _update_numeric_stats( self, -        df_series: pl.DataFrame, +        df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -430,7 +429,7 @@ def _update_numeric_stats( subset before they are merged into the main data profile. :type subset_properties: Dict :param df_series: Data to be profiled -    :type df_series: Pandas Dataframe +    :type df_series: polars.series.series.Series :return: None """ super()._update_helper(df_series, subset_properties) @@ -440,7 +439,7 @@ def update(self, df_series: pl.Series) -> FloatColumn: Update the column profile. 
:param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: updated FloatColumn :rtype: FloatColumn """ diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 30e7a4a8..15394ce8 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -2,7 +2,6 @@ from __future__ import annotations import numpy as np -import pandas as pd import polars as pl from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler @@ -125,7 +124,7 @@ def _is_each_row_int(cls, df_series: pl.Series) -> list[bool]: For column [1.1 1.1 1.1] returns False :param df_series: series of values to evaluate - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: is_int_col :rtype: list """ @@ -140,12 +139,11 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: Update col profile properties with clean dataset and its known null params. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: int profile dictionary :type profile: dict :return: None """ - df_series_clean = pd.Series(df_series_clean.to_numpy()) if self._NumericStatsMixin__calculations: NumericStatsMixin._update_helper(self, df_series_clean, profile) self._update_column_base_properties(profile) @@ -155,7 +153,7 @@ def update(self, df_series: pl.Series) -> IntColumn: Update the column profile. 
:param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: updated IntColumn :rtype: IntColumn """ diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index 549fcc43..0e8677d2 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -10,7 +10,6 @@ import numpy as np import numpy.typing as npt -import pandas as pd import polars as pl import scipy.stats @@ -498,7 +497,6 @@ def diff( "Unsupported operand type(s) for diff: '{}' " "and '{}'".format(cls.__name__, other_profile.__class__.__name__) ) - print(self.variance, other_profile.variance) differences = { "min": profiler_utils.find_diff_of_numbers(self.min, other_profile.min), "max": profiler_utils.find_diff_of_numbers(self.max, other_profile.max), @@ -1125,10 +1123,9 @@ def _estimate_stats_from_histogram(self) -> np.float64: return var def _total_histogram_bin_variance( - self, input_array: np.ndarray | pd.Series + self, input_array: np.ndarray | pl.Series ) -> float: - if type(input_array) is pd.Series: - input_array = pl.from_pandas(input_array) + if type(input_array) is pl.Series: input_array = input_array.to_numpy() # calculate total variance over all bins of a histogram bin_counts = self._stored_histogram["histogram"]["bin_counts"] @@ -1146,20 +1143,18 @@ def _total_histogram_bin_variance( sum_var += bin_var return sum_var - def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> np.float64: + def _histogram_bin_error(self, input_array: np.ndarray | pl.Series) -> np.float64: """ Calculate error of each value from bin of the histogram it falls within. 
:param input_array: input data used to calculate the histogram - :type input_array: Union[np.array, pd.Series] + :type input_array: Union[np.array, pl.Series] :return: binning error :rtype: float """ - if type(input_array) is pd.Series: - input_array = pl.from_pandas(input_array) + if type(input_array) == pl.Series: input_array = input_array.to_numpy() - bin_edges = self._stored_histogram["histogram"]["bin_edges"] - + bin_edges = self._stored_histogram["histogram"]["bin_edges"].astype(float) # account ofr digitize which is exclusive bin_edges = bin_edges.copy() @@ -1280,7 +1275,7 @@ def _get_histogram( Uses np.histogram. :param values: input data values - :type values: Union[np.array, pd.Series] + :type values: Union[np.array, pl.Series] :return: bin edges and bin counts """ if len(np.unique(values)) == 1: @@ -1323,18 +1318,17 @@ def _get_histogram( bin_counts, bin_edges = np.histogram(values, bins=n_equal_bins) return bin_counts, bin_edges - def _merge_histogram(self, values: np.ndarray | pd.Series) -> None: + def _merge_histogram(self, values: np.ndarray | pl.Series) -> None: # values is the current array of values, # that needs to be updated to the accumulated histogram - if type(values) is pd.Series: - values = pl.from_pandas(values) + if type(values) == pl.Series: values = values.to_numpy() combined_values = np.concatenate([values, self._histogram_to_array()]) bin_counts, bin_edges = self._get_histogram(combined_values) self._stored_histogram["histogram"]["bin_counts"] = bin_counts self._stored_histogram["histogram"]["bin_edges"] = bin_edges - def _update_histogram(self, df_series: pd.Series | np.ndarray) -> None: + def _update_histogram(self, df_series: pl.Series) -> None: """ Update histogram for each method and the combined method. 
@@ -1352,30 +1346,31 @@ def _update_histogram(self, df_series: pd.Series | np.ndarray) -> None: accumulated losses, and the best method with minimal loss is picked :param df_series: a given column - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: """ - if self._greater_than_64_bit and type(df_series) is pd.Series: - df_series = df_series.to_numpy(dtype=float) - df_series = df_series[np.isfinite(df_series)] - if df_series.size == 0: + if self._greater_than_64_bit: + df_np_series = df_series.to_numpy() + df_np_series = df_np_series[np.isfinite(df_np_series)] + if df_np_series.size == 0: return + if self._has_histogram: + self._merge_histogram(df_np_series) + else: + bin_counts, bin_edges = self._get_histogram(df_np_series) + self._stored_histogram["histogram"]["bin_counts"] = bin_counts + self._stored_histogram["histogram"]["bin_edges"] = bin_edges else: - df_series = pl.from_pandas(df_series, nan_to_null=True).cast(pl.Float64) - df_series = df_series.replace([np.inf, -np.inf], [None]) # type: ignore - df_series = df_series.drop_nulls() + df_series = df_series.filter(~df_series.is_infinite()) + df_series = df_series.drop_nans() if df_series.is_empty(): return - - if self._has_histogram: - if self._greater_than_64_bit: - self._merge_histogram(df_series.tolist()) + if self._has_histogram: + self._merge_histogram(df_series) else: - self._merge_histogram(df_series.to_list()) - else: - bin_counts, bin_edges = self._get_histogram(df_series) - self._stored_histogram["histogram"]["bin_counts"] = bin_counts - self._stored_histogram["histogram"]["bin_edges"] = bin_edges + bin_counts, bin_edges = self._get_histogram(df_series) + self._stored_histogram["histogram"]["bin_counts"] = bin_counts + self._stored_histogram["histogram"]["bin_edges"] = bin_edges # update loss for the stored bins histogram_loss = self._histogram_bin_error(df_series) @@ -1749,36 +1744,30 @@ def _get_quantiles(self) -> None: ] self.quantiles = 
self._get_percentile(percentiles=percentiles) - def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: + def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: """ Update base numerical profile properties w/ clean dataset and known null params. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: numerical profile dictionary :type profile: dict :return: None """ - self._greater_than_64_bit = ( - not df_series_clean.empty - and df_series_clean.apply(pd.to_numeric, errors="coerce").dtype == "O" - ) + self._greater_than_64_bit = df_series_clean.dtype == pl.Object if self._greater_than_64_bit: - df_series_clean = df_series_clean.to_numpy() - df_series_clean = df_series_clean[df_series_clean != np.nan] - if df_series_clean.size == 0: + df_np_series_clean = df_series_clean.to_numpy() + df_np_series_clean = df_np_series_clean[df_np_series_clean != np.nan] + if df_np_series_clean.size == 0: return - df_series_clean = pd.Series(df_series_clean) + df_series_clean = pl.Series(df_np_series_clean) else: - df_series_clean = pl.from_pandas(df_series_clean) if df_series_clean.dtype == pl.String: df_series_clean = df_series_clean.str.strip_chars().cast(pl.Float64) else: df_series_clean = df_series_clean.cast(pl.Float64) if df_series_clean.is_empty(): return - df_series_clean = df_series_clean.to_pandas() - df_series_clean = df_series_clean.astype(float) prev_dependent_properties = { "mean": self.mean, @@ -1800,15 +1789,14 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: @BaseColumnProfiler._timeit(name="min") def _get_min( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if self._greater_than_64_bit: - min_value = np.min(df_series) + min_value = min(df_series) self.min = min_value if not self.min else 
min(self.min, min_value) else: - df_series = pl.from_pandas(df_series) min_value = df_series.min() self.min = np.float64( min_value if not self.min else min(self.min, min_value) @@ -1818,15 +1806,14 @@ def _get_min( @BaseColumnProfiler._timeit(name="max") def _get_max( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if self._greater_than_64_bit: - max_value = np.max(df_series) + max_value = max(df_series) self.max = max_value if not self.max else max(self.max, max_value) else: - df_series = pl.from_pandas(df_series) max_value = df_series.max() if self.max is not None: max_value = type(self.max)(max_value) @@ -1838,14 +1825,14 @@ def _get_max( @BaseColumnProfiler._timeit(name="sum") def _get_sum( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: if np.isinf(self.sum) or (np.isnan(self.sum) and self.match_count > 0): return if self._greater_than_64_bit: - sum_value = np.sum(df_series) + sum_value = float(sum(df_series)) if len(df_series) > 0 and sum_value == np.nan: warnings.warn( "Infinite or invalid values found in data. 
" @@ -1854,7 +1841,6 @@ def _get_sum( RuntimeWarning, ) else: - df_series = pl.from_pandas(df_series) sum_value = df_series.sum() if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)): warnings.warn( @@ -1870,7 +1856,7 @@ def _get_sum( @BaseColumnProfiler._timeit(name="variance") def _get_variance( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1879,9 +1865,8 @@ def _get_variance( ): return if self._greater_than_64_bit: - batch_biased_variance = np.var(df_series) + batch_biased_variance = np.var(df_series.to_numpy()) else: - df_series = pl.from_pandas(df_series) batch_biased_variance = np.var([df_series]) subset_properties["biased_variance"] = batch_biased_variance sum_value = subset_properties["sum"] @@ -1900,7 +1885,7 @@ def _get_variance( @BaseColumnProfiler._timeit(name="skewness") def _get_skewness( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1908,7 +1893,7 @@ def _get_skewness( Compute and update skewness of current dataset given new chunk. 
:param df_series: incoming data - :type df_series: pandas series + :type df_series: polars series :param prev_dependent_properties: pre-update values needed for computation :type prev_dependent_properties: dict @@ -1924,11 +1909,10 @@ def _get_skewness( ): return - if self._greater_than_64_bit and type(df_series) is pd.Series: - df_series = df_series.to_numpy(dtype=float) + if self._greater_than_64_bit and type(df_series) is pl.Series: + batch_biased_skewness = profiler_utils.biased_skew(df_series.to_numpy()) else: - df_series = pl.from_pandas(df_series, nan_to_null=False) - batch_biased_skewness = profiler_utils.biased_skew(df_series) + batch_biased_skewness = profiler_utils.biased_skew(df_series) subset_properties["biased_skewness"] = batch_biased_skewness batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] @@ -1948,7 +1932,7 @@ def _get_skewness( @BaseColumnProfiler._timeit(name="kurtosis") def _get_kurtosis( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -1956,7 +1940,7 @@ def _get_kurtosis( Compute and update kurtosis of current dataset given new chunk. 
:param df_series: incoming data - :type df_series: pandas series + :type df_series: polars series :param prev_dependent_properties: pre-update values needed for computation :type prev_dependent_properties: dict @@ -1972,11 +1956,10 @@ def _get_kurtosis( ): return - if self._greater_than_64_bit and type(df_series) is pd.Series: - df_series = df_series.to_numpy(dtype=float) + if self._greater_than_64_bit: + batch_biased_kurtosis = profiler_utils.biased_kurt(df_series.to_numpy()) else: - df_series = pl.from_pandas(df_series, nan_to_null=False) - batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) + batch_biased_kurtosis = profiler_utils.biased_kurt(df_series) subset_properties["biased_kurtosis"] = batch_biased_kurtosis batch_count = subset_properties["match_count"] batch_biased_var = subset_properties["biased_variance"] @@ -1999,7 +1982,7 @@ def _get_kurtosis( @BaseColumnProfiler._timeit(name="histogram_and_quantiles") def _get_histogram_and_quantiles( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2017,7 +2000,7 @@ def _get_histogram_and_quantiles( @BaseColumnProfiler._timeit(name="num_zeros") def _get_num_zeros( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2025,23 +2008,26 @@ def _get_num_zeros( Get the count of zeros in the numerical column. 
:param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :param prev_dependent_properties: previous dependent properties :type prev_dependent_properties: dict :param subset_properties: subset of properties :type subset_properties: dict :return: None """ - if not self._greater_than_64_bit: - df_series = pl.from_pandas(df_series) - num_zeros_value = (df_series == 0).sum() + if df_series.is_empty(): + num_zeros_value = 0 + elif self._greater_than_64_bit: + num_zeros_value = int((df_series.to_numpy() == 0).sum()) + else: + num_zeros_value = int((df_series == 0).sum()) subset_properties["num_zeros"] = num_zeros_value self.num_zeros = self.num_zeros + num_zeros_value @BaseColumnProfiler._timeit(name="num_negatives") def _get_num_negatives( self, - df_series: pd.Series | np.ndarray, + df_series: pl.Series, prev_dependent_properties: dict, subset_properties: dict, ) -> None: @@ -2049,16 +2035,19 @@ def _get_num_negatives( Get the count of negative numbers in the numerical column. :param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :param prev_dependent_properties: previous dependent properties :type prev_dependent_properties: dict :param subset_properties: subset of properties :type subset_properties: dict :return: None """ - if not self._greater_than_64_bit: - df_series = pl.from_pandas(df_series) - num_negatives_value = (df_series < 0).sum() + if df_series.is_empty(): + num_negatives_value = 0 + elif self._greater_than_64_bit: + num_negatives_value = int((df_series.to_numpy() < 0).sum()) + else: + num_negatives_value = int((df_series < 0).sum()) subset_properties["num_negatives"] = num_negatives_value self.num_negatives = self.num_negatives + num_negatives_value @@ -2068,7 +2057,7 @@ def update(self, df_series: pl.Series) -> NumericStatsMixin: Update the numerical profile properties with an uncleaned dataset. 
:param df_series: df series with nulls removed - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: None """ raise NotImplementedError() diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index f2ea321e..200bd5d3 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -141,7 +141,7 @@ def _update_vocab( Find the unique vocabulary used in the text column. :param data: list or array of data from which to extract vocab - :type data: Union[list, numpy.array, pandas.DataFrame] + :type data: Union[list, numpy.array, polars.DataFrame] :param prev_dependent_properties: Contains all the previous properties that the calculations depend on. :type prev_dependent_properties: dict @@ -158,16 +158,14 @@ def _update_helper(self, df_series_clean: pl.Series, profile: dict) -> None: Update col profile properties with clean dataset and its known null parameters. :param df_series_clean: df series with nulls removed - :type df_series_clean: pandas.core.series.Series + :type df_series_clean: polars.series.series.Series :param profile: text profile dictionary :type profile: dict :return: None """ if self._NumericStatsMixin__calculations: text_lengths = df_series_clean.str.len_chars() - NumericStatsMixin._update_helper( - self, text_lengths.drop_nulls().to_pandas(), profile - ) + NumericStatsMixin._update_helper(self, text_lengths.drop_nulls(), profile) self._update_column_base_properties(profile) if self.max: self.type = "string" if self.max <= 255 else "text" @@ -177,7 +175,7 @@ def update(self, df_series: pl.Series) -> TextColumn: Update the column profile. 
:param df_series: df series - :type df_series: pandas.core.series.Series + :type df_series: polars.series.series.Series :return: updated TextColumn :rtype: TextColumn """ diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index 7b4d2ccc..a1291276 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -6,7 +6,7 @@ from unittest import mock import numpy as np -import pandas as pd +import polars as pl from dataprofiler.profilers import NumericStatsMixin from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler @@ -340,7 +340,7 @@ def test_timeit(self): "biased_skewness": 0, } data = np.array([0, 0, 0, 0, 0]) - df_series = pd.Series(data) + df_series = pl.Series(data) subset_properties = {"min": 0, "match_count": 0} time_array = [float(i) for i in range(24, 0, -1)] @@ -547,21 +547,21 @@ def test_num_zeros(self): prev_dependent_properties = {"mean": 0} subset_properties = {"num_zeros": 0} - df_series = pd.Series([]) + df_series = pl.Series([]) num_profiler._get_num_zeros( df_series, prev_dependent_properties, subset_properties ) self.assertEqual(subset_properties["num_zeros"], 0) data = np.array([0, 0, 0, 0, 0]) - df_series = pd.Series(data) + df_series = pl.Series(data) num_profiler._get_num_zeros( df_series, prev_dependent_properties, subset_properties ) self.assertEqual(subset_properties["num_zeros"], 5) data = np.array([000.0, 0.00, 0.000, 1.11234, 0, -1]) - df_series = pd.Series(data) + df_series = pl.Series(data) num_profiler._get_num_zeros( df_series, prev_dependent_properties, subset_properties ) @@ -574,21 +574,21 @@ def test_num_negatives(self): prev_dependent_properties = {"mean": 0} subset_properties = {"num_negatives": 0} - df_series = pd.Series([]) + df_series = pl.Series([]) num_profiler._get_num_negatives( df_series, prev_dependent_properties, 
subset_properties ) self.assertEqual(subset_properties["num_negatives"], 0) data = np.array([0, 0, 0, 0, 0]) - df_series = pd.Series(data) + df_series = pl.Series(data) num_profiler._get_num_negatives( df_series, prev_dependent_properties, subset_properties ) self.assertEqual(subset_properties["num_negatives"], 0) data = np.array([1, 0, -0.003, -16, -1.0, -24.45]) - df_series = pd.Series(data) + df_series = pl.Series(data) num_profiler._get_num_negatives( df_series, prev_dependent_properties, subset_properties ) @@ -675,7 +675,7 @@ def test_timeit_num_zeros_and_negatives(self): # Dummy data to make min call prev_dependent_properties = {"mean": 0} data = np.array([0, 0, 0, 0, 0]) - df_series = pd.Series(data) + df_series = pl.Series(data) subset_properties = {"num_zeros": 0, "num_negatives": 0} time_array = [float(i) for i in range(4, 0, -1)]