From 748d0c8ddb28399628cdee221f6ceab0f3af8426 Mon Sep 17 00:00:00 2001
From: ubd725
Date: Tue, 27 Feb 2024 15:50:36 -0600
Subject: [PATCH] Add polars to datetime_column_profile

---
 .../profilers/datetime_column_profile.py | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py
index af99283a9..7b2b57988 100644
--- a/dataprofiler/profilers/datetime_column_profile.py
+++ b/dataprofiler/profilers/datetime_column_profile.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from . import profiler_utils
 from .base_column_profilers import BaseColumnPrimitiveTypeProfiler, BaseColumnProfiler
@@ -256,8 +257,7 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
         profile: dict = dict()
         activated_date_formats: list = list()
         len_df = len(df_series)
-
-        is_row_datetime = pd.Series(np.full((len(df_series)), False))
+        is_row_datetime = pd.Series(np.full((len_df), False))
 
         min_value = None
         max_value = None
@@ -275,18 +275,19 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
                 )
             )
 
-            df_dates = valid_dates[~valid_dates.isnull()]
+            df_dates = pl.Series(valid_dates[~valid_dates.isnull()])
 
-            if "%b" in date_format and not df_dates.empty:
+            if "%b" in date_format and not df_dates.is_empty():
                 may_month = 5  # May can be %b or %B we want to force, so check
-                all_may = df_dates.apply(lambda x: x.month == may_month).all()
+                all_may = df_dates.map_elements(lambda x: x.month == may_month)
+                all_may = pl.Series(all_may).all()
                 if all_may:
-                    valid_dates[:] = np.nan
-                    df_dates = pd.Series([], dtype=object)
+                    valid_dates[:] = None
+                    df_dates = pl.Series([])
 
             # Create mask to avoid null dates
             null_date_mask = valid_dates.isnull()
-            np_date_array = df_dates.values
+            np_date_array = df_dates.to_numpy()
 
             # check off any values which were found to be datetime
             is_row_datetime[~is_row_datetime] = (~null_date_mask).values
@@ -298,18 +299,18 @@ def _get_datetime_profile(cls, df_series: pd.Series) -> dict:
                 min_idx = np.argmin(np_date_array)
                 max_idx = np.argmax(np_date_array)
 
                 # Selects the min, max value objects for comparison
-                tmp_min_value_obj = df_dates.iloc[min_idx]
-                tmp_max_value_obj = df_dates.iloc[max_idx]
+                tmp_min_value_obj = df_dates.item(int(min_idx))
+                tmp_max_value_obj = df_dates.item(int(max_idx))
 
                 # If minimum value, keep reference
                 if tmp_min_value_obj < min_value_obj:
                     min_value = df_series[~null_date_mask].iloc[min_idx]
-                    min_value_obj = tmp_min_value_obj
+                    min_value_obj = pd.Timestamp(tmp_min_value_obj)
 
                 # If maximum value, keep reference
                 if tmp_max_value_obj > max_value_obj:
                     max_value = df_series[~null_date_mask].iloc[max_idx]
-                    max_value_obj = tmp_max_value_obj
+                    max_value_obj = pd.Timestamp(tmp_max_value_obj)
 
             df_series = df_series[null_date_mask]
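
For reference, below is a minimal standalone sketch (not part of the patch) of the
pandas-to-polars calls this diff switches to, assuming polars >= 0.19 alongside
pandas and numpy. The sample data and variable names are illustrative only and
do not come from the DataProfiler code base.

    import numpy as np
    import pandas as pd
    import polars as pl

    # Coerce a small sample of strings to datetimes, as the profiler does.
    raw = pd.Series(["May 1 2020", "May 15 2020", None])
    valid_dates = pd.to_datetime(raw, format="%b %d %Y", errors="coerce")

    # pandas -> polars: pl.Series() accepts a pandas Series directly.
    df_dates = pl.Series(valid_dates[~valid_dates.isnull()])

    # .is_empty() replaces pandas' .empty attribute.
    if not df_dates.is_empty():
        # .map_elements() replaces pandas' per-element .apply().
        all_may = df_dates.map_elements(
            lambda x: x.month == 5, return_dtype=pl.Boolean
        ).all()
        print("all values fall in May:", all_may)

    # .to_numpy() replaces .values, and .item(i) replaces .iloc[i].
    np_date_array = df_dates.to_numpy()
    min_idx = int(np.argmin(np_date_array))
    print("earliest value:", pd.Timestamp(df_dates.item(min_idx)))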