Skip to content

Commit

Permalink
Encoding time in Expr function and process_data_cluster in lazy mode #153
Browse files Browse the repository at this point in the history

Signed-off-by: Armand <[email protected]>
  • Loading branch information
armgilles committed Oct 9, 2024
1 parent c2c78aa commit 01f2bd5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 24 deletions.
39 changes: 18 additions & 21 deletions src/vcub_keeper/transform/features_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,41 +167,39 @@ def fast_parse_date_(s):
return data


def get_encoding_time(data: pl.DataFrame, col_date: str, max_val: int) -> pl.DataFrame:
def get_encoding_time(col_date: str, max_val: int) -> list[pl.Expr]:
"""
Encoding time
Parameters
----------
data : DataFrame
Activité des stations Vcub
col_date : str
Nom de la colonne à encoder
max_val : int
Valeur maximal que la valeur peut avoir (ex 12 pour le mois)
Returns
-------
data : DataFrame
Ajout de colonne Sin_[col_date] & Cos_[col_date]
list[pl.Expr]
Examples
Example
--------
data = get_encoding_time(data, 'month', max_val=12)
encoding_quarter_expr = get_encoding_time("quarter", max_val=4)
data.with_columns(*encoding_quarter_expr)
"""

two_pi = 2 * np.pi
expr_two_pi_div_max_val = pl.lit(two_pi / max_val)
data = data.with_columns(
[
(expr_two_pi_div_max_val * pl.col(col_date)).sin().alias("Sin_" + col_date),
(expr_two_pi_div_max_val * pl.col(col_date)).cos().alias("Cos_" + col_date),
]
)
return data

# Création des expressions pour Sin et Cos
sin_expr = (expr_two_pi_div_max_val * pl.col(col_date)).sin().alias(f"Sin_{col_date}")
cos_expr = (expr_two_pi_div_max_val * pl.col(col_date)).cos().alias(f"Cos_{col_date}")

def process_data_cluster(data: pl.DataFrame) -> pl.DataFrame:
return [sin_expr, cos_expr]


def process_data_cluster(data: pl.LazyFrame) -> pl.LazyFrame:
"""
Process some Feature engineering
Expand All @@ -212,7 +210,7 @@ def process_data_cluster(data: pl.DataFrame) -> pl.DataFrame:
Returns
-------
data : DataFrame
data : LazyFrame
Add some columns in DataFrame
Examples
Expand All @@ -223,15 +221,14 @@ def process_data_cluster(data: pl.DataFrame) -> pl.DataFrame:
data = data.with_columns(
[
pl.col("date").dt.quarter().alias("quarter"),
# pl.col("date").dt.month().alias("month"),
pl.col("date").dt.weekday().alias("weekday"),
pl.col("date").dt.hour().alias("hours"),
]
)

data = get_encoding_time(data, "quarter", max_val=4)
# data = get_encoding_time(data, 'month', max_val=12)
data = get_encoding_time(data, "weekday", max_val=7)
data = get_encoding_time(data, "hours", max_val=24)
encoding_quarter_expr = get_encoding_time("quarter", max_val=4)
encoding_weekday_expr = get_encoding_time("weekday", max_val=7)
encoding_hours_expr = get_encoding_time("hours", max_val=24)
data = data.with_columns(*encoding_quarter_expr, *encoding_weekday_expr, *encoding_hours_expr)

return data
9 changes: 6 additions & 3 deletions tests/test_transf_transaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,8 @@ def test_get_encoding_time_quarter():
}
)

result = get_encoding_time(data, "quarter", max_val=4)
encoding_quarter_expr = get_encoding_time("quarter", max_val=4)
result = data.lazy().with_columns(*encoding_quarter_expr).collect()

expected_sin = np.sin(2 * np.pi * data["quarter"] / 4)
expected_cos = np.cos(2 * np.pi * data["quarter"] / 4)
Expand All @@ -252,7 +253,8 @@ def test_get_encoding_time_weekday():
}
)

result = get_encoding_time(data, "weekday", max_val=7)
encoding_weekday_expr = get_encoding_time("weekday", max_val=7)
result = data.lazy().with_columns(*encoding_weekday_expr).collect()

expected_sin = np.sin(2 * np.pi * data["weekday"] / 7)
expected_cos = np.cos(2 * np.pi * data["weekday"] / 7)
Expand All @@ -274,7 +276,8 @@ def test_get_encoding_time_hours():
}
)

result = get_encoding_time(data, "hours", max_val=24)
encoding_hours_expr = get_encoding_time("hours", max_val=24)
result = data.lazy().with_columns(*encoding_hours_expr).collect()

expected_sin = np.sin(2 * np.pi * data["hours"] / 24)
expected_cos = np.cos(2 * np.pi * data["hours"] / 24)
Expand Down

0 comments on commit 01f2bd5

Please sign in to comment.