Skip to content

Commit

Permalink
Add polars to datetime_column_profile
Browse files Browse the repository at this point in the history
  • Loading branch information
abajpai15 committed Feb 27, 2024
1 parent 19c7ebe commit 748d0c8
Showing 1 changed file with 13 additions and 12 deletions.
25 changes: 13 additions & 12 deletions dataprofiler/profilers/datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
import pandas as pd
import polars as pl

from . import profiler_utils
from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
Expand Down Expand Up @@ -256,8 +257,7 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
profile: dict = dict()
activated_date_formats: list = list()
len_df = len(df_series)

is_row_datetime = pd.Series(np.full((len(df_series)), False))
is_row_datetime = pd.Series(np.full((len_df), False))

min_value = None
max_value = None
Expand All @@ -275,18 +275,19 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
)
)

df_dates = valid_dates[~valid_dates.isnull()]
df_dates = pl.Series(valid_dates[~valid_dates.isnull()])

if "%b" in date_format and not df_dates.empty:
if "%b" in date_format and not df_dates.is_empty():
may_month = 5 # "May" is spelled the same under %b and %B, so check whether every matched date falls in May
all_may = df_dates.apply(lambda x: x.month == may_month).all()
all_may = df_dates.map_elements(lambda x: x.month == may_month)
all_may = pl.Series(all_may).all()
if all_may:
valid_dates[:] = np.nan
df_dates = pd.Series([], dtype=object)
valid_dates[:] = None
df_dates = pl.Series([])

# Create mask to avoid null dates
null_date_mask = valid_dates.isnull()
np_date_array = df_dates.values
np_date_array = df_dates.to_numpy()

# check off any values which were found to be datetime
is_row_datetime[~is_row_datetime] = (~null_date_mask).values
Expand All @@ -298,18 +299,18 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
max_idx = np.argmax(np_date_array)

# Selects the min, max value objects for comparison
tmp_min_value_obj = df_dates.iloc[min_idx]
tmp_max_value_obj = df_dates.iloc[max_idx]
tmp_min_value_obj = df_dates.item(int(min_idx))
tmp_max_value_obj = df_dates.item(int(max_idx))

# If minimum value, keep reference
if tmp_min_value_obj < min_value_obj:
min_value = df_series[~null_date_mask].iloc[min_idx]
min_value_obj = tmp_min_value_obj
min_value_obj = pd.Timestamp(tmp_min_value_obj)

# If maximum value, keep reference
if tmp_max_value_obj > max_value_obj:
max_value = df_series[~null_date_mask].iloc[max_idx]
max_value_obj = tmp_max_value_obj
max_value_obj = pd.Timestamp(tmp_max_value_obj)

df_series = df_series[null_date_mask]

Expand Down

0 comments on commit 748d0c8

Please sign in to comment.