From c527e3a61fc975a501f11874b9b8f6cca0bb05b4 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Thu, 7 Nov 2024 21:58:34 +0000 Subject: [PATCH] move time_helpers to shared_utils --- _shared_utils/shared_utils/__init__.py | 2 + _shared_utils/shared_utils/time_helpers.py | 78 +++++++++++++++++ .../segment_speed_utils/__init__.py | 2 - .../gtfs_schedule_wrangling.py | 6 +- .../segment_speed_utils/time_helpers.py | 84 ------------------- 5 files changed, 83 insertions(+), 89 deletions(-) create mode 100644 _shared_utils/shared_utils/time_helpers.py delete mode 100644 rt_segment_speeds/segment_speed_utils/time_helpers.py diff --git a/_shared_utils/shared_utils/__init__.py b/_shared_utils/shared_utils/__init__.py index b8b49a6b9..a0de7ba01 100644 --- a/_shared_utils/shared_utils/__init__.py +++ b/_shared_utils/shared_utils/__init__.py @@ -9,6 +9,7 @@ rt_dates, rt_utils, schedule_rt_utils, + time_helpers, ) __all__ = [ @@ -22,4 +23,5 @@ "rt_dates", "rt_utils", "schedule_rt_utils", + "time_helpers", ] diff --git a/_shared_utils/shared_utils/time_helpers.py b/_shared_utils/shared_utils/time_helpers.py new file mode 100644 index 000000000..29b3d0c1e --- /dev/null +++ b/_shared_utils/shared_utils/time_helpers.py @@ -0,0 +1,78 @@ +""" +Helpers for defining peak vs offpeak periods and +weekdays and weekends so we can aggregate our +existing time-of-day bins. 
+""" +import datetime + +import pandas as pd + +PEAK_PERIODS = ["AM Peak", "PM Peak"] + +HOURS_BY_TIME_OF_DAY = { + "Owl": 4, # [0, 3] + "Early AM": 3, # [4, 6] + "AM Peak": 3, # [7, 9] + "Midday": 5, # [10, 14] + "PM Peak": 5, # [15, 19] + "Evening": 4, # [20, 23] +} + +TIME_OF_DAY_DICT = { + **{k: "peak" for k, v in HOURS_BY_TIME_OF_DAY.items() if k in PEAK_PERIODS}, + **{k: "offpeak" for k, v in HOURS_BY_TIME_OF_DAY.items() if k not in PEAK_PERIODS}, +} + +DAY_TYPE_DICT = { + 1: "Sunday", + 2: "Monday", + 3: "Tuesday", + 4: "Wednesday", + 5: "Thursday", + 6: "Friday", + 7: "Saturday", +} + +WEEKDAY_DICT = { + **{k: "weekday" for k in ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]}, + **{k: "weekend" for k in ["Saturday", "Sunday"]}, +} + + +def time_span_labeling(date_list: list) -> tuple[str]: + """ + If we grab a week's worth of trips, we'll + use this week's average to stand-in for the entire month. + Label with month and year. + """ + time_span_str = list(set([datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%b%Y").lower() for d in date_list])) + + time_span_num = list(set([datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%m_%Y").lower() for d in date_list])) + + if len(time_span_str) == 1: + return time_span_str[0], time_span_num[0] + + else: + print(f"multiple months: {time_span_str}") + return time_span_str, time_span_num + + +def add_time_span_columns(df: pd.DataFrame, time_span_num: str) -> pd.DataFrame: + """ + Add columns for month / year, use when we have aggregated time-series. 
+ """ + month = int(time_span_num.split("_")[0]) + year = int(time_span_num.split("_")[1]) + + # Downgrade some dtypes for public bucket + df = df.assign( + month=month, + year=year, + ).astype( + { + "month": "int16", + "year": "int16", + } + ) + + return df diff --git a/rt_segment_speeds/segment_speed_utils/__init__.py b/rt_segment_speeds/segment_speed_utils/__init__.py index 6104fd7d6..4d10f7d0e 100644 --- a/rt_segment_speeds/segment_speed_utils/__init__.py +++ b/rt_segment_speeds/segment_speed_utils/__init__.py @@ -7,7 +7,6 @@ parallel_corridors, project_vars, segment_calcs, - time_helpers, time_series_utils, vp_transform, ) @@ -21,7 +20,6 @@ "parallel_corridors", "project_vars", "segment_calcs", - "time_helpers", "time_series_utils", "vp_transform", ] \ No newline at end of file diff --git a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py index d73c378bd..da2ac2b2c 100644 --- a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py +++ b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py @@ -1,14 +1,14 @@ """ All kinds of GTFS schedule table wrangling. 
""" +import dask.dataframe as dd import geopandas as gpd import pandas as pd -import dask.dataframe as dd from typing import Literal, Union -from segment_speed_utils import helpers, time_helpers -from shared_utils import portfolio_utils, rt_utils +from segment_speed_utils import helpers +from shared_utils import portfolio_utils, rt_utils, time_helpers from segment_speed_utils.project_vars import SEGMENT_GCS sched_rt_category_dict = { diff --git a/rt_segment_speeds/segment_speed_utils/time_helpers.py b/rt_segment_speeds/segment_speed_utils/time_helpers.py deleted file mode 100644 index af74842ae..000000000 --- a/rt_segment_speeds/segment_speed_utils/time_helpers.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -Helpers for defining peak vs offpeak periods and -weekend and weekends so we can aggregate our -existing time-of-day bins. -""" -import datetime -import pandas as pd - -PEAK_PERIODS = ["AM Peak", "PM Peak"] - -HOURS_BY_TIME_OF_DAY = { - "Owl": 4, #[0, 3] - "Early AM": 3, #[4, 6] - "AM Peak": 3, #[7, 9] - "Midday": 5, #[10, 14] - "PM Peak": 5, #[15, 19] - "Evening": 4 #[20, 23] -} - -TIME_OF_DAY_DICT = { - **{k: "peak" for k, v in HOURS_BY_TIME_OF_DAY.items() - if k in PEAK_PERIODS}, - **{k: "offpeak" for k, v in HOURS_BY_TIME_OF_DAY.items() - if k not in PEAK_PERIODS} -} - -DAY_TYPE_DICT = { - 1: "Sunday", - 2: "Monday", - 3: "Tuesday", - 4: "Wednesday", - 5: "Thursday", - 6: "Friday", - 7: "Saturday", -} - -WEEKDAY_DICT = { - **{k: "weekday" for k in ["Monday", "Tuesday", "Wednesday", - "Thursday", "Friday"]}, - **{k: "weekend" for k in ["Saturday", "Sunday"]} -} - -def time_span_labeling(date_list: list) -> tuple[str]: - """ - If we grab a week's worth of trips, we'll - use this week's average to stand-in for the entire month. - Label with month and year. 
- """ - time_span_str = list(set( - [datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%b%Y").lower() - for d in date_list] - )) - - time_span_num = list(set( - [datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%m_%Y").lower() - for d in date_list] - )) - - if len(time_span_str) == 1: - return time_span_str[0], time_span_num[0] - - else: - print(f"multiple months: {time_span_str}") - return time_span_str, time_span_num - - -def add_time_span_columns( - df: pd.DataFrame, - time_span_num: str -) -> pd.DataFrame: - - month = int(time_span_num.split('_')[0]) - year = int(time_span_num.split('_')[1]) - - # Downgrade some dtypes for public bucket - df = df.assign( - month = month, - year = year, - ).astype({ - "month": "int16", - "year": "int16", - }) - - return df \ No newline at end of file