diff --git a/_shared_utils/shared_utils/gtfs_analytics_data.yml b/_shared_utils/shared_utils/gtfs_analytics_data.yml index ea7561ded..48cf97884 100644 --- a/_shared_utils/shared_utils/gtfs_analytics_data.yml +++ b/_shared_utils/shared_utils/gtfs_analytics_data.yml @@ -68,6 +68,7 @@ digest_tables: operator_profiles: "digest/operator_profiles" operator_routes_map: "digest/operator_routes" operator_sched_rt: "digest/operator_schedule_rt_category" + operator_metrics: "digest/operator_metrics" scheduled_service_hours: "digest/total_scheduled_service_hours" stop_segments: diff --git a/gtfs_digest/03_report.ipynb b/gtfs_digest/03_report.ipynb index 94e1a4530..e5ab9fea9 100644 --- a/gtfs_digest/03_report.ipynb +++ b/gtfs_digest/03_report.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "6bd20d9d-a3af-430e-8c19-c90fb8ef9e62", "metadata": { "tags": [ @@ -60,20 +60,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "d870c492-ef2c-45f6-ab47-8d46eda7f344", "metadata": { "tags": [] }, "outputs": [], "source": [ - "%%capture_parameters\n", - "organization_name" + " %%capture_parameters\n", + " organization_name" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "b8e11fd2-041f-4e1d-a00f-6e000269c1a7", "metadata": {}, "outputs": [], @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "8e840f91-2e1a-4235-bf6b-0c049a569b4a", "metadata": {}, "outputs": [], @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "517702ae-a7ac-4cc4-a2d4-158fdc8d6919", "metadata": {}, "outputs": [], @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "dd8b7a4c-7682-4949-9e9b-990ce6867627", "metadata": {}, "outputs": [], @@ -114,6 +114,20 @@ "scheduled_service = section1.load_operator_service_hours(name)" ] }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "72641974-01d5-47cf-b963-baf546e9e958", + "metadata": {}, + "outputs": [], + "source": [ + "# Dataset with agency metrics\n", + "try:\n", + " agency_metrics_df = section2.load_operator_metrics(organization_name)\n", + "except:\n", + " pass" + ] + }, { "cell_type": "code", "execution_count": 10, @@ -342,23 +356,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1094,7 +1108,7 @@ ], "source": [ "try:\n", - " display(section2.agency_overview(sched_vp_df))\n", + " display(section2.agency_overview(agency_metrics_df))\n", "except:\n", " display(Markdown(f\"\"\"{organization_name} only has schedule data.\"\"\"))" ] @@ -1118,23 +1132,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agency_spatial_chart" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "80222431-fb3c-4528-8cd9-ca7af926466d", "metadata": {}, "outputs": [], "source": [ - "def simple_bar_chart(\n", - " df: pd.DataFrame,\n", - " y_col: str,\n", - " ruler_col: str,\n", - " title: str,\n", - " subtitle: str,\n", - " domain_color:list,\n", - " range_color:list,\n", - ") -> alt.Chart:\n", - " tooltip_cols = [\n", - " \"Date\",\n", - " y_col,\n", - " ]\n", - " \n", - " # Set y-axis\n", - " max_y = _section2_utils.set_y_axis(df, y_col)\n", - " \n", - " # Create color scale\n", - " color_scale = alt.Scale(\n", - " domain= domain_color,\n", - " range = range_color\n", - " )\n", - " \n", - " # Create ruler\n", - " ruler = (\n", - " alt.Chart(df)\n", - " .mark_rule(color=\"red\", strokeDash=[10, 7])\n", - " .encode(y=f\"mean({ruler_col}):Q\")\n", - " )\n", - " \n", - " chart = (\n", - " alt.Chart(df)\n", - " .mark_bar(size=7, clip=True)\n", - " .encode(\n", - " x=alt.X(\n", - " \"yearmonthdate(Date):O\",\n", - " title=[\"Date\"],\n", - " axis=alt.Axis(labelAngle=-45, format=\"%b %Y\"),\n", - " ),\n", - " y=alt.Y(\n", - " f\"{y_col}:Q\",\n", - " title=_report_utils.labeling(y_col),\n", - " scale=alt.Scale(domain=[0, max_y]),\n", - " ),\n", - " color=alt.Color(\n", - " f\"{y_col}:Q\",\n", - " title=_report_utils.labeling(y_col),\n", - " scale=color_scale,\n", - " ),\n", - " tooltip=df[tooltip_cols].columns.tolist(),\n", + "agency_vp_chart = (\n", + " (\n", + " section2.simple_bar_chart(\n", + " df,\n", + " \"VP per Minute (All Routes)\",\n", + " \"ruler_for_vp_per_min\",\n", + " readable_dict[\"agency_vp_per_min_graph\"][\"title\"],\n", + " readable_dict[\"vp_per_min_graph\"][\"subtitle\"],\n", + " color_dict[\"vp_domain\"],\n", + " color_dict[\"vp_range\"]\n", " )\n", " )\n", - "\n", - " chart = (chart + 
ruler).properties(width=400, height=250,\n", - " title={\n", - " \"text\": title,\n", - " \"subtitle\": [subtitle],\n", - " }\n", - " )\n", - "\n", - " return chart" + " )" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "cafb446d-f0a6-4130-9bea-603081a967d7", + "execution_count": 12, + "id": "b93659f6-5b53-4384-ae26-eb946a2bf993", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agency_vp_chart" + ] + }, + { + "cell_type": "markdown", + "id": "9dbdbd09-42b1-470e-8b91-3b8cdcd11ee7", + "metadata": {}, + "source": [ + "### Look at City of Visalia\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d8456dd5-9ea0-459f-8af0-4101eacbede7", "metadata": {}, "outputs": [], "source": [ - "def agency_overview(df:pd.DataFrame)->alt.Chart:\n", - " agg1 = aggregate_by_agency(df)\n", - " \n", - " agency_spatial_chart = (\n", - " simple_bar_chart(\n", - " agg1,\n", + "df2 = load_operator_metrics(\"City of Visalia\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "15f80dda-e628-4bc9-bf5d-160411490bd1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " (\n", + " section2.simple_bar_chart(\n", + " df2,\n", + " \"VP per Minute (All Routes)\",\n", + " \"ruler_for_vp_per_min\",\n", + " readable_dict[\"agency_vp_per_min_graph\"][\"title\"],\n", + " readable_dict[\"vp_per_min_graph\"][\"subtitle\"],\n", + " color_dict[\"vp_domain\"],\n", + " color_dict[\"vp_range\"]\n", + " )\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "14559142-ba86-4a4b-880c-1b7578ae5b66", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " section2.simple_bar_chart(\n", + " df2,\n", " \"Spatial Accuracy (All Routes)\",\n", " \"ruler_100_pct\",\n", " readable_dict[\"agency_spatial_accuracy\"][\"title\"],\n", @@ -200,11 +567,117 @@ " color_dict[\"spatial_accuracy_range\"]\n", " )\n", " )\n", - " \n", - " agency_vp_chart = (\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "41420e6e-318c-4006-a475-bb7709d93820", + "metadata": {}, + "source": [ + "### Orange County Transportation Authority" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8c9bffb8-5e74-4592-984c-514ab8a3f166", + "metadata": {}, + "outputs": [], + "source": [ + "df3 = load_operator_metrics(\"Orange County Transportation Authority\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "46025ae3-59c6-471c-bb3c-6f2f5838ba99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", " (\n", - " simple_bar_chart(\n", - " agg1,\n", + " section2.simple_bar_chart(\n", + " df3,\n", " \"VP per Minute (All Routes)\",\n", " \"ruler_for_vp_per_min\",\n", " readable_dict[\"agency_vp_per_min_graph\"][\"title\"],\n", @@ -213,19 +686,13 @@ " color_dict[\"vp_range\"]\n", " )\n", " )\n", - " )\n", - " \n", - " chart_list = [agency_spatial_chart, agency_vp_chart]\n", - " chart = alt.vconcat(*chart_list).resolve_scale(\n", - " color='independent')\n", - "\n", - " return chart" + " )" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "65259c01-bd91-4200-9572-bfdd5e7c6f98", + "execution_count": 18, + "id": "af863f21-ba7d-4980-ab56-c6fe6b20c087", "metadata": {}, "outputs": [ { @@ -233,23 +700,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ - "alt.VConcatChart(...)" + "alt.LayerChart(...)" ] }, - "execution_count": 9, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "agency_overview(df)" + "(\n", + " section2.simple_bar_chart(\n", + " df3,\n", + " \"Spatial Accuracy (All Routes)\",\n", + " \"ruler_100_pct\",\n", + " readable_dict[\"agency_spatial_accuracy\"][\"title\"],\n", + " readable_dict[\"spatial_accuracy_graph\"][\"subtitle\"],\n", + " color_dict[\"spatial_accuracy_domain\"],\n", + " color_dict[\"spatial_accuracy_range\"]\n", + " )\n", + " )\n", + " " ] } ], diff --git a/gtfs_digest/35_agg_by_agency2.ipynb b/gtfs_digest/35_agg_by_agency2.ipynb new file mode 100644 index 000000000..bc29f1a40 --- /dev/null +++ b/gtfs_digest/35_agg_by_agency2.ipynb @@ -0,0 +1,70 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2682013d-2d46-4584-a421-38b4f9fe9a13", + "metadata": {}, + "source": [ + "## Use `vp_trips` as the jumping off point" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "49127032-4559-415b-b7c9-9d3fb9e8c50c", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'shared_utils'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_aggregate_agency\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_report_utils\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_section1_utils\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msection1\u001b[39;00m\n", + "File \u001b[0;32m~/data-analyses/gtfs_digest/_aggregate_agency.py:1\u001b[0m\n\u001b[0;32m----> 
1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_operators_prep\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgeopandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mgpd\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n", + "File \u001b[0;32m~/data-analyses/gtfs_digest/_operators_prep.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mshared_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m catalog_utils\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'shared_utils'" + ] + } + ], + "source": [ + "import _aggregate_agency\n", + "import _report_utils\n", + "import _section1_utils as section1\n", + "import _section2_utils as section2\n", + "import geopandas as gpd\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "993d3d22-1fd2-46c2-a288-85cd5c0021fa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gtfs_digest/_archive_section2_utils.py b/gtfs_digest/_archive_section2_utils.py deleted file mode 100644 index 219f8fb2b..000000000 
--- a/gtfs_digest/_archive_section2_utils.py +++ /dev/null @@ -1,1009 +0,0 @@ -import calitp_data_analysis.magics -import geopandas as gpd -import pandas as pd - -# Charts -from calitp_data_analysis import calitp_color_palette as cp -import altair as alt -alt.data_transformers.enable('default', max_rows=None) - -# Great Tables -import great_tables as gt -from great_tables import md - -# Display -from IPython.display import HTML, Markdown, display - -# Other -from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS -from shared_utils import catalog_utils, rt_dates, rt_utils -GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") - -import _report_utils -""" -Schedule_vp_metrics -Functions -""" -def timeliness_tags(row): - if row.rt_sched_journey_ratio < 1: - return "Early" - elif row.rt_sched_journey_ratio < 1.1: - return "On Time" - elif 1.1 <= row.rt_sched_journey_ratio < 1.26: - return "Late by 1-25% of the scheduled time" - elif 1.26 <= row.rt_sched_journey_ratio < 1.51: - return "Late by 26-50% of the scheduled time" - elif 1.51 <= row.rt_sched_journey_ratio: - return "Late by 50%+ of the scheduled time" - else: - return "No Info" - -def frequency_tags(row): - if row.frequency < 2: - return "<1 trip/hour" - elif 1 <= row.frequency < 2: - return "1 trip/hour" - elif 2 <= row.frequency < 3: - return "2 trips/hour" - elif 3 <= row.frequency: - return "3+ trips/hour" - else: - return "No Info" - - -def vp_per_min_tag(row): - if row.vp_per_minute < 1: - return "<1 ping/minute" - elif 1 <= row.vp_per_minute < 2: - return "<3 pings/minute" - elif 2 <= row.vp_per_minute < 3: - return "<3 pings/minute" - elif 3 <= row.vp_per_minute: - return "3+ pings per minute (target)" - else: - return "No Info" - -def add_categories(df:pd.DataFrame) -> pd.DataFrame: - df["rt_sched_journey_ratio_cat"] = df.apply(timeliness_tags, axis=1) - df["frequency_cat"] = df.apply(frequency_tags, axis=1) - df["vp_per_minute_cat"] = df.apply(vp_per_min_tag, axis=1) - - 
return df - -def load_schedule_vp_metrics(name:str)->pd.DataFrame: - schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet" - - df = pd.read_parquet(schd_vp_url, filters=[[("name", "==", name)]]) - - # Categorize - df = add_categories(df) - - # Round float columns - float_columns = df.select_dtypes(include=['float']) - for i in float_columns: - df[i] = df[i].round(2) - - pct_cols = df.columns[df.columns.str.contains("pct")].tolist() - for i in pct_cols: - df[i] = df[i] * 100 - - # Add rulers - df["ruler_100_pct"] = 100 - df["ruler_for_vp_per_min"] = 2 - return df - -def route_stats(df: pd.DataFrame) -> pd.DataFrame: - most_recent_date = df.service_date.max() - route_merge_cols = ["route_combined_name", "direction_id"] - - all_day_stats = df[ - (df.service_date == most_recent_date) & (df.time_period == "all_day") - ][ - route_merge_cols - + [ - "avg_scheduled_service_minutes", - "avg_stop_miles", - "n_scheduled_trips", - "sched_rt_category", - ] - ] - - peak_stats = df[(df.service_date == most_recent_date) & (df.time_period == "peak")][ - route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"] - ].rename( - columns={ - "speed_mph": "peak_avg_speed", - "n_scheduled_trips": "peak_scheduled_trips", - "frequency": "peak_hourly_freq", - } - ) - - offpeak_stats = df[ - (df.service_date == most_recent_date) & (df.time_period == "offpeak") - ][route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]].rename( - columns={ - "speed_mph": "offpeak_avg_speed", - "n_scheduled_trips": "offpeak_scheduled_trips", - "frequency": "offpeak_hourly_freq", - } - ) - - table_df = ( - pd.merge( - all_day_stats, - peak_stats, - on=route_merge_cols, - how = "outer" - ) - .merge(offpeak_stats, on=route_merge_cols, how = "outer") - .sort_values(["route_combined_name", "direction_id"]) - .reset_index(drop=True) - ) - - numeric_cols = table_df.select_dtypes(include="number").columns - table_df[numeric_cols] = 
table_df[numeric_cols].fillna(0) - - return table_df - -def timeliness_trips(df: pd.DataFrame): - to_keep = [ - "service_date", - "organization_name", - "direction_id", - "time_period", - "route_combined_name", - "is_early", - "is_ontime", - "is_late", - "n_vp_trips", - ] - df = df[to_keep] - df2 = df.loc[df.time_period != "all_day"].reset_index(drop=True) - - melted_df = df2.melt( - id_vars=[ - "service_date", - "organization_name", - "route_combined_name", - "time_period", - "direction_id", - ], - value_vars=["is_early", "is_ontime", "is_late"], - ) - return melted_df - -def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame: - to_keep = [ - "service_date", - "organization_name", - "direction_id", - col1, - col2, - "route_combined_name", - "time_period", - "route_id", - "ruler_100_pct", - ] - df2 = df[to_keep] - - df3 = df2.melt( - id_vars=[ - "service_date", - "organization_name", - "route_combined_name", - "direction_id", - "time_period", - "route_id", - "ruler_100_pct", - ], - value_vars=[col1, col2], - ) - - return df3 - -""" -Operator Level -""" -def trips_by_gtfs(df): - df = df.loc[df.time_period=="all_day"] - - by_date_category = ( - pd.crosstab( - df.service_date, - df.sched_rt_category, - values=df.n_scheduled_trips, - aggfunc="sum", - ) - .reset_index() - .fillna(0)) - - display(gt.GT(by_date_category, rowname_col="service_date") - .tab_header( - title="Daily Trips by GTFS Availability", - subtitle="Schedule only indicates the trip(s) were found only in schedule data. 
Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data.", - ) - .cols_label( - schedule_only="Schedule Only", - vp_only="VP Only", - schedule_and_vp="Schedule and VP", - ) - .fmt_integer(["schedule_only", "vp_only", "schedule_and_vp"]) - .tab_options(container_width="75%") - .tab_options(table_font_size="12px")) - -""" -operator_schedule_rt_category -""" -def load_operator_schedule_rt_category(schedule_gtfs_key: list) -> pd.DataFrame: - df = pd.read_parquet( - f"{RT_SCHED_GCS}digest/operator_schedule_rt_category.parquet", - filters=[[("schedule_gtfs_dataset_key", "in", schedule_gtfs_key)]], - ) - df.n_trips = df.n_trips.astype(int).fillna(0) - return df - - -""" -Charts -""" -def create_data_unavailable_chart(): - data = pd.DataFrame({"text": ["Chart unavailable, not enough data."]}) - - # Create a text chart using Altair - chart = ( - alt.Chart(data) - .mark_text( - align="center", - baseline="middle", - fontSize=12, - text="Chart unavailable due to lack of data", - ) - .properties(width=500, height=100) - ) - - return chart - -def clean_data_charts(df:pd.DataFrame, y_col:str)->pd.DataFrame: - df = df.assign( - time_period=df.time_period.str.replace("_", " ").str.title() - ).reset_index(drop=True) - - df[y_col] = df[y_col].fillna(0).astype(int) - df[f"{y_col}_str"] = df[y_col].astype(str) - - - return df - -def grouped_bar_chart( - df: pd.DataFrame, - color_col: str, - y_col: str, - offset_col: str, - title: str, - subtitle: str, -): - tooltip_cols = [ - "direction_id", - "time_period", - "route_combined_name", - "organization_name", - "service_date", - color_col, - y_col, - ] - - if len(df) == 0: - text_chart = create_data_unavailable_chart() - return text_chart - else: - df = clean_data_charts(df,y_col) - chart = ( - alt.Chart(df) - .mark_bar(size=10) - .encode( - x=alt.X( - "yearmonthdate(service_date):O", - title=["Grouped by Direction ID", "Date"], - axis=alt.Axis(labelAngle=-45, format="%b %Y"), - ), - y=alt.Y(f"{y_col}:Q", 
title=_report_utils.labeling(y_col)), - xOffset=alt.X(f"{offset_col}:N", title=_report_utils.labeling(offset_col)), - color=alt.Color( - f"{color_col}:N", - title=_report_utils.labeling(color_col), - scale=alt.Scale( - range=_report_utils.red_green_yellow, - ), - ), - tooltip=tooltip_cols, - ) - ) - chart = (chart).properties( - title={ - "text": [title], - "subtitle": [subtitle], - }, - width=500, - height=300, - ) - - return chart - -def base_facet_line( - df: pd.DataFrame, y_col: str, title: str, subtitle: str -) -> alt.Chart: - if len(df) == 0: - text_chart = create_data_unavailable_chart() - return text_chart - else: - selection = alt.selection_point(fields=['time_period'], bind='legend') - - df = clean_data_charts(df,y_col) - tooltip_cols = [ - "route_combined_name", - "route_id", - "direction_id", - "time_period", - f"{y_col}_str", - ] - if "pct" in y_col: - max_y = 100 - elif "per_minute" in y_col: - max_y = round(df[y_col].max()) - else: - max_y = round(df[y_col].max(), -1) + 5 - chart = ( - alt.Chart(df) - .mark_line(size=5) - .encode( - x=alt.X( - "yearmonthdate(service_date):O", - title="Date", - axis=alt.Axis(labelAngle=-45, format="%b %Y"), - ), - y=alt.Y( - f"{y_col}:Q", - title=_report_utils.labeling(y_col), - scale=alt.Scale(domain=[0, max_y]), - ), - color=alt.Color( - "time_period:N", - title=_report_utils.labeling("time_period"), - scale=alt.Scale(range=_report_utils.red_green_yellow), - ), - - strokeWidth=alt.condition( - "datum.time_peak == 'All Day'", - alt.value(10), - alt.value(1)), - - tooltip=tooltip_cols, - ) - ) - - chart = chart.properties(width=250, height=300) - chart = chart.facet( - column=alt.Column("direction_id:N", title=_report_utils.labeling("direction_id")), - ).properties( - title={ - "text": [title], - "subtitle": [subtitle], - } - ).add_params(selection) - return chart -def base_facet_circle( - df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str -) -> alt.Chart: - - tooltip_cols = [ - "direction_id", - 
"time_period", - "route_combined_name", - "service_date", - f"{y_col}_str", - "variable", - ] - - if len(df) == 0: - text_chart = create_data_unavailable_chart() - return text_chart - else: - if "pct" in y_col: - max_y = 100 - elif "per_minute" in y_col: - max_y = round(df[y_col].max()) - else: - max_y = round(df[y_col].max(), -1) + 5 - df = clean_data_charts(df,y_col) - df = df.assign( - variable=df.variable.str.replace("_", " ").str.title(), - ).reset_index(drop=True) - ruler = ( - alt.Chart(df) - .mark_rule(color="red", strokeDash=[10, 7]) - .encode(y=f"ruler_100_pct:Q") - ) - - chart = ( - alt.Chart(df) - .mark_circle(size=100) - .encode( - x=alt.X( - "yearmonthdate(service_date):O", - title="Date", - axis=alt.Axis(labelAngle=-45, format="%b %Y"), - ), - y=alt.Y( - f"{y_col}:Q", - title=_report_utils.labeling(y_col), - scale=alt.Scale(domain=[0, max_y]), - ), - color=alt.Color( - "variable:N", - title=_report_utils.labeling("variable"), - scale=alt.Scale(range=_report_utils.red_green_yellow), - ), - tooltip=tooltip_cols, - ) - ) - - chart = chart + ruler - chart = chart.facet( - column=alt.Column("direction_id:N", title=_report_utils.labeling("direction_id")), - ).properties( - title={ - "text": [title], - "subtitle": [subtitle], - } - ) - return chart -def base_facet_chart( - df: pd.DataFrame, - y_col: str, - color_col: str, - facet_col: str, - title: str, - subtitle: str, -): - tooltip_cols = [ - "direction_id", - "time_period", - "route_combined_name", - "organization_name", - "service_date", - y_col, - color_col, - ] - - if len(df) == 0: - text_chart = create_data_unavailable_chart() - return text_chart - else: - if "pct" in y_col: - max_y = 100 - elif "per_minute" in y_col: - max_y = round(df[y_col].max()) - else: - max_y = round(df[y_col].max(), -1) + 5 - df = clean_data_charts(df,y_col) - chart = ( - ( - alt.Chart(df) - .mark_bar(size=15, clip=True) - .encode( - x=alt.X( - "yearmonthdate(service_date):O", - title=["Service Date"], - 
axis=alt.Axis(labelAngle=-45, format="%b %Y"), - ), - y=alt.Y( - f"{y_col}:Q", - title=_report_utils.labeling(y_col), - scale=alt.Scale(domain=[0, max_y]), - ), - color=alt.Color( - f"{color_col}:N", - title=_report_utils.labeling(color_col), - scale=alt.Scale(range=_report_utils.red_green_yellow), - ), - tooltip=tooltip_cols, - ) - ) - .facet( - column=alt.Column( - f"{facet_col}:N", - ) - ) - .properties( - title={ - "text": title, - "subtitle": subtitle, - } - ) - ) - return chart - -def base_facet_with_ruler_chart( - df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str -): - tooltip_cols = [ - "direction_id", - "time_period", - "route_combined_name", - "organization_name", - "service_date", - y_col, - ] - - if len(df) == 0: - text_chart = create_data_unavailable_chart() - return text_chart - else: - df = clean_data_charts(df,y_col) - if "pct" in y_col: - max_y = 100 - elif "per_minute" in y_col: - max_y = round(df[y_col].max()) + 2 - else: - max_y = round(df[y_col].max(), -1) + 5 - ruler = ( - alt.Chart(df) - .mark_rule(color="red", strokeDash=[10, 7]) - .encode(y=f"mean({ruler_col}):Q") - ) - chart = ( - alt.Chart(df) - .mark_bar(size=15, clip=True) - .encode( - x=alt.X( - "yearmonthdate(service_date):O", - title=["Service Date"], - axis=alt.Axis(labelAngle=-45, format="%b %Y"), - ), - y=alt.Y( - f"{y_col}:Q", - title=_report_utils.labeling(y_col), - scale=alt.Scale(domain=[0, max_y]), - ), - color=alt.Color( - f"{y_col}:Q", - title=_report_utils.labeling(y_col), - scale=alt.Scale(range=_report_utils.red_green_yellow), - ), - tooltip=df[tooltip_cols].columns.tolist(), - ) - ) - - chart = chart + ruler - chart = chart.facet(column=alt.Column("direction_id:N",)).properties( - title={ - "text": title, - "subtitle": [subtitle], - } - ) - - return chart - -def create_text_table(df: pd.DataFrame, direction_id: str): - - df = ( - df.loc[df.direction_id == direction_id].drop_duplicates().reset_index(drop=True) - ) - - if len(df) == 0: - text_chart = 
create_data_unavailable_chart() - return text_chart - - else: - df2 = df.melt( - id_vars=[ - "route_combined_name", - "direction_id", - ], - value_vars=[ - "avg_scheduled_service_minutes", - "avg_stop_miles", - "n_scheduled_trips", - "sched_rt_category", - "peak_avg_speed", - "peak_scheduled_trips", - "peak_hourly_freq", - "offpeak_avg_speed", - "offpeak_scheduled_trips", - "offpeak_hourly_freq", - ], - ) - # Create a decoy column to center all the text - df2["Zero"] = 0 - - df2.variable = df2.variable.str.replace("_", " ").str.title() - df2 = df2.sort_values(by=["direction_id"]).reset_index(drop=True) - df2["combo_col"] = df2.variable.astype(str) + ": " + df2.value.astype(str) - text_chart = ( - alt.Chart(df2) - .mark_text() - .encode(x=alt.X("Zero:Q", axis=None), y=alt.Y("combo_col", axis=None)) - ) - - text_chart = text_chart.encode(text="combo_col:N").properties( - title=f"Route Statistics for Direction {direction_id}", - width=500, - height=300, - ) - return text_chart - -def frequency_chart(df: pd.DataFrame): - if len(df) == 0: - text_chart = create_data_unavailable_chart() - return text_chart - - else: - chart = ( - alt.Chart(df, width=180, height=alt.Step(10)) - .mark_bar() - .encode( - alt.Y( - "yearmonthdate(service_date):O", - title="Date", - axis=alt.Axis(format="%b %Y"), - ), - alt.X("frequency:Q", title=_report_utils.labeling("frequency"), axis=None), - alt.Color("frequency", scale=alt.Scale(range=_report_utils.red_green_yellow)).title( - _report_utils.labeling("Frequency") - ), - alt.Row("time_period:N") - .title(_report_utils.labeling("time_period")) - .header(labelAngle=0), - alt.Column("direction_id:N").title(_report_utils.labeling("direction_id")), - ) - ) - - chart = chart.properties(title="Frequency of Trips per Hour") - return chart -""" -Route-Direction -Section -""" -def filtered_route( - df: pd.DataFrame, -) -> alt.Chart: - """ - https://stackoverflow.com/questions/58919888/multiple-selections-in-altair - """ - # Filter for only schedule 
and vp - df_sched_vp_both = df[df.sched_rt_category == "schedule_and_vp"].reset_index( - drop=True - ) - routes_list = df_sched_vp_both["route_combined_name"].unique().tolist() - - - route_dropdown = alt.binding_select( - options=routes_list, - name="Routes", - ) - - # Column that controls the bar charts - route_selector = alt.selection_point( - fields=["route_combined_name"], - bind=route_dropdown, - ) - - # Data - # Filter for only rows categorized as found in schedule and vp and all_day - all_day = df_sched_vp_both.loc[ - df_sched_vp_both.time_period == "all_day" - ].reset_index(drop=True) - - # Create route stats table for the text tables - route_stats_df = route_stats(df) - - # Manipulate the df for some of the metrics - timeliness_df = timeliness_trips(df_sched_vp_both) - rt_journey_vp = pct_vp_journey( - all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp" - ) - sched_journey_vp = pct_vp_journey( - all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp" - ) - - # Charts - avg_scheduled_min = ( - grouped_bar_chart( - df=all_day.drop_duplicates(), - color_col="direction_id", - y_col="avg_scheduled_service_minutes", - offset_col="direction_id", - title="Average Scheduled Minutes", - subtitle="The average minutes a trip is scheduled to run.", - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - - timeliness_trips_dir_0 = ( - ( - base_facet_chart( - timeliness_df.loc[timeliness_df.direction_id == 0].drop_duplicates(), - "value", - "variable", - "time_period", - "Breakdown of Trips by Categories for Direction 0", - "Categorizing whether a trip is early, late, or ontime. 
A trip is on time if it arrives 5 minutes later or earlier than scheduled.", - ) - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - timeliness_trips_dir_1 = ( - ( - base_facet_chart( - timeliness_df.loc[timeliness_df.direction_id == 1].drop_duplicates(), - "value", - "variable", - "time_period", - "Breakdown of Trips by Categories for Direction 1", - "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.", - ) - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - - frequency = ( - frequency_chart(df_sched_vp_both) - .add_params(route_selector) - .transform_filter(route_selector) - ) - speed = ( - base_facet_line( - df_sched_vp_both, - "speed_mph", - "Average Speed", - "The average miles per hour the bus travels by direction and time of day.", - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - - vp_per_min = ( - ( - base_facet_with_ruler_chart( - all_day.drop_duplicates(), - "vp_per_minute", - "ruler_for_vp_per_min", - "Vehicle Positions per Minute", - "Trips should have 2+ vehicle positions per minute.", - ) - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - - rt_vp_per_min = ( - base_facet_circle( - rt_journey_vp, - "value", - "ruler_100_pct", - "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions", - "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.", - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - sched_vp_per_min = ( - base_facet_circle( - sched_journey_vp, - "value", - "sched_journey_vp", - "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions", - "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.", - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - spatial_accuracy = ( - base_facet_with_ruler_chart( - all_day.drop_duplicates(), - "pct_in_shape", - 
"ruler_100_pct", - "Spatial Accuracy", - "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.", - ) - .add_params(route_selector) - .transform_filter(route_selector) - ) - - text_dir0 = ( - (create_text_table(route_stats_df, 0)) - .add_params(route_selector) - .transform_filter(route_selector) - ) - text_dir1 = ( - create_text_table(route_stats_df, 1) - .add_params(route_selector) - .transform_filter(route_selector) - ) - chart_list = [ - avg_scheduled_min, - timeliness_trips_dir_0, - timeliness_trips_dir_1, - frequency, - speed, - vp_per_min, - rt_vp_per_min, - sched_vp_per_min, - spatial_accuracy, - text_dir0, - text_dir1, - ] - - chart = alt.vconcat(*chart_list).properties( - resolve=alt.Resolve( - scale=alt.LegendResolveMap(color=alt.ResolveMode("independent")) - ) - ) - return chart - -### Section 1 -def summarize_monthly(df:pd.DataFrame)->pd.DataFrame: - df2 = ( - df.groupby( - ['name', 'month','time_of_day', 'day_name'] - ) - .agg( - { - "ttl_service_hours": "sum", - } - ) - .reset_index() - ) - - return df2 - -def convert_to_timestamps(datetime_list): - timestamps = [] - for dt in datetime_list: - timestamp = dt.astype("datetime64[s]").astype(datetime) - timestamps.append(timestamp) - return timestamps - -def count_days_in_months(dates: list) -> pd.DataFrame: - # Turn list from numpy datetime to timestamp - dates2 = convert_to_timestamps(dates) - # Initialize a dictionary to store counts for each day of the week - day_counts = {} - - # Iterate over each date - for date in dates2: - year = date.year - month = date.month - - # Initialize counts dictionary for the current month-year combination - if (year, month) not in day_counts: - day_counts[(year, month)] = { - "Monday": 0, - "Tuesday": 0, - "Wednesday": 0, - "Thursday": 0, - "Friday": 0, - "Saturday": 0, - "Sunday": 0, - } - - # Get the calendar matrix for the current month and year - matrix = 
calendar.monthcalendar(year, month) - - # Iterate over each day in the matrix - for week in matrix: - for i, day in enumerate(week): - # Increment the count for the corresponding day of the week - if day != 0: - weekday = calendar.day_name[i] - day_counts[(year, month)][weekday] += 1 - - # Convert the dictionary to a pandas DataFrame - df = pd.DataFrame.from_dict(day_counts, orient="index") - df = df.reset_index() - df["level_1"] = df["level_1"].astype(str).str.zfill(2) - df["month"] = df.level_0.astype(str) + "-" + df.level_1.astype(str) - df = df.drop(columns=["level_0", "level_1"]) - - # Melt from wide to long - df2 = pd.melt( - df, - id_vars=["month"], - value_vars=[ - "Monday", - "Tuesday", - "Wednesday", - "Thursday", - "Friday", - "Saturday", - "Sunday", - "month", - ]) - - df2 = df2.rename(columns = {"variable":"day_name", "value":"n_days"}) - return df2 - -def total_monthly_service(name:str) ->pd.DataFrame: - - df = load_scheduled_service(name) - - # Grab unique dates - unique_dates = list(df.datetime_date.unique()) - - # Find number of Monday's, Tuesday's...etc in each date - month_days_df = count_days_in_months(unique_dates) - - # Aggregate the original dataframe - agg_df = summarize_monthly(df) - - # Merge on number of day types - agg_df = pd.merge(agg_df, month_days_df, on =["month", "day_name"], how = "left") - - # Find daily service hours - agg_df["Daily Service Hours"] = agg_df.ttl_service_hours / agg_df.n_days - - # Rename columns - agg_df.columns = agg_df.columns.map(_report_utils.replace_column_names) - - return agg_df - -def single_bar_chart_dropdown( - df: pd.DataFrame, - x_col: str, - y_col: str, - offset_col: str, - title: str, - dropdown_col: str, - subtitle:str -): - dropdown_list = df[dropdown_col].unique().tolist() - dropdown_list.sort(reverse=True) - dropdown = alt.binding_select(options=dropdown_list, name=_report_utils.labeling(dropdown_col)) - - selector = alt.selection_point( - name=_report_utils.labeling(dropdown_col), 
fields=[dropdown_col], bind=dropdown - ) - - chart = ( - alt.Chart(df) - .mark_bar() - .encode( - x=alt.X( - f"{x_col}:N", - title="Day", - scale=alt.Scale( - domain=[ - "Monday", - "Tuesday", - "Wednesday", - "Thursday", - "Friday", - "Saturday", - "Sunday", - ] - ), - ), - y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)), - xOffset=f"{offset_col}:N", - color=alt.Color( - f"{offset_col}:N", - title=_report_utils.labeling(offset_col), - scale=alt.Scale( - range=color_dict["full_color_scale"], - ), - ), - tooltip=df.columns.tolist(), - ) - ) - chart = chart.properties( - title = { - "text": [title], - "subtitle": [subtitle], - }, width=400, height=250) - chart = chart.add_params(selector).transform_filter(selector) - - display(chart) - \ No newline at end of file diff --git a/gtfs_digest/_section1_utils.py b/gtfs_digest/_section1_utils.py index 371b5e4cc..193bba0a6 100644 --- a/gtfs_digest/_section1_utils.py +++ b/gtfs_digest/_section1_utils.py @@ -105,6 +105,19 @@ def load_operator_service_hours(name:str)->pd.DataFrame: df.columns = df.columns.map(_report_utils.replace_column_names) return df +def load_operator_metrics(name:str)->pd.DataFrame: + """ + Load dataframe with the total scheduled service hours + a transit operator. 
+ """ + url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet" + + df = pd.read_parquet(url, + filters=[[(("name", "==", name))]]) + + # Rename dataframe + df.columns = df.columns.map(_report_utils.replace_column_names) + return df """ Data Manipulation Change dataframes from long to wide diff --git a/gtfs_digest/_section2_utils.py b/gtfs_digest/_section2_utils.py index 510b83beb..0ad947a35 100644 --- a/gtfs_digest/_section2_utils.py +++ b/gtfs_digest/_section2_utils.py @@ -67,6 +67,24 @@ def load_schedule_vp_metrics(organization:str)->pd.DataFrame: return df +def load_operator_metrics(organization_name:str)->pd.DataFrame: + """ + Load dataframe with the total scheduled service hours + a transit operator. + """ + url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_metrics}.parquet" + + df = pd.read_parquet(url, + filters=[[(("organization_name", "==", organization_name))]]) + + # Rename dataframe + df.columns = df.columns.map(_report_utils.replace_column_names) + + df["ruler_100_pct"] = 100 + + df["ruler_for_vp_per_min"] = 2 + return df + """ Data Manipulation """ @@ -199,41 +217,6 @@ def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame: ) return df3 -def aggregate_by_agency(df: pd.DataFrame) -> pd.DataFrame: - """ - Aggregate some of the metrics for all the routes - across the agency. 
- """ - # Filter to all day to avoid double counting - df = df.loc[df["Period"] == "all_day"].reset_index(drop=True) - - # Aggregate by totals by date - agg1 = ( - df.groupby(["Date"]) - .agg( - { - "# VP": "sum", - "# VP within Scheduled Shape": "sum", - "Aggregate Actual Service Minutes": "sum", - "ruler_100_pct":"max", - "ruler_for_vp_per_min":"max" - } - ) - .reset_index() - ) - - # Find metrics - agg1["VP per Minute (All Routes)"] = ( - (agg1["# VP"] / agg1[ "Aggregate Actual Service Minutes"]) - ).round(2) - agg1["Spatial Accuracy (All Routes)"] = (( - agg1["# VP within Scheduled Shape"] / agg1["# VP"] - ) * 100).round(2) - - # Sort the data - agg1 = agg1.sort_values(by=["Date"]).reset_index(drop=True) - - return agg1 """ Charts """ @@ -769,11 +752,9 @@ def simple_bar_chart( Agency Metrics Overview Section """ def agency_overview(df:pd.DataFrame)->alt.Chart: - agg1 = aggregate_by_agency(df) - # display(agg1.head()) agency_spatial_chart = ( simple_bar_chart( - agg1, + df, "Spatial Accuracy (All Routes)", "ruler_100_pct", readable_dict["agency_spatial_accuracy"]["title"], @@ -786,7 +767,7 @@ def agency_overview(df:pd.DataFrame)->alt.Chart: agency_vp_chart = ( ( simple_bar_chart( - agg1, + df, "VP per Minute (All Routes)", "ruler_for_vp_per_min", readable_dict["agency_vp_per_min_graph"]["title"], diff --git a/gtfs_digest/merge_operator_data.py b/gtfs_digest/merge_operator_data.py index 3a489cd52..cd4cc271b 100644 --- a/gtfs_digest/merge_operator_data.py +++ b/gtfs_digest/merge_operator_data.py @@ -101,6 +101,24 @@ def operator_category_counts_by_date() -> pd.DataFrame: return operator_category_counts +def concatenate_operator_level_metrics( + date_list: list +) -> pd.DataFrame: + """ + Get spatial accuracy and VP per Minute metrics on the + operator-service_date grain. 
+ """ + FILE = f"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_operator_metrics}" + + df = time_series_utils.concatenate_datasets_across_dates( + RT_SCHED_GCS, + FILE, + date_list, + data_type = "df", + ).sort_values(sort_cols).reset_index(drop=True) + + return df + if __name__ == "__main__": @@ -110,11 +128,18 @@ def operator_category_counts_by_date() -> pd.DataFrame: OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map + OPERATOR_METRICS = GTFS_DATA_DICT.digest_tables.operator_metrics SCHED_RT_CATEGORY = GTFS_DATA_DICT.digest_tables.operator_sched_rt CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys() + # Concat operator metrics. + operator_metrics = concatenate_operator_level_metrics(analysis_date_list) + operator_metrics.to_parquet( + f"{RT_SCHED_GCS}{OPERATOR_METRICS}.parquet" + ) + # Concat operator profiles df = concatenate_operator_stats(analysis_date_list) @@ -192,4 +217,5 @@ def operator_category_counts_by_date() -> pd.DataFrame: operator_category_counts.to_parquet( f"{RT_SCHED_GCS}{SCHED_RT_CATEGORY}.parquet" ) + \ No newline at end of file diff --git a/gtfs_digest/readable.yml b/gtfs_digest/readable.yml index 991561a89..57e4f9569 100644 --- a/gtfs_digest/readable.yml +++ b/gtfs_digest/readable.yml @@ -56,6 +56,8 @@ operator_n_trips: "# Trips" operator_route_length_miles: "Operator Service Miles" organization_name: Organization organization_source_record_id: "Organization ID" +vp_per_min_agency: "VP per Minute (All Routes)" +spatial_accuracy_agency: "Spatial Accuracy (All Routes)" # Dates / time time_period: Period diff --git a/rt_scheduled_v_ran/11_agency_agg.ipynb b/rt_scheduled_v_ran/11_agency_agg.ipynb index 124f42ae8..fa30bfb9a 100644 --- a/rt_scheduled_v_ran/11_agency_agg.ipynb +++ b/rt_scheduled_v_ran/11_agency_agg.ipynb @@ -55,33 +55,13 @@ "id": "baf7ddd3-fd43-458a-9479-71bc9f7935db", 
"metadata": {}, "source": [ - "### Exploring" + "### Exploring\n", + "* Need to filter for only (\"sched_rt_category\", \"==\", \"schedule_and_vp\") to get the same results as the one on analysis.calitp.org." ] }, { "cell_type": "code", "execution_count": 3, - "id": "00b45e96-315f-4f74-af8c-74eb994057ab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "RT_SCHED_GCS" - ] - }, - { - "cell_type": "code", - "execution_count": 4, "id": "78b859a7-7598-4719-a806-887b31a5daa9", "metadata": {}, "outputs": [], @@ -91,174 +71,119 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "a666f731-821a-4d7f-adc6-36ab7ee1428c", - "metadata": {}, - "outputs": [], - "source": [ - "ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2d7af748-ceae-4a95-88e6-e24eb788a253", + "execution_count": 4, + "id": "b9cdee67-aafe-4e6a-adb9-5210e49bd82f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'vp_route_dir/route_direction_metrics'" + "['2024-01-17',\n", + " '2024-02-14',\n", + " '2024-03-13',\n", + " '2024-04-17',\n", + " '2024-05-22',\n", + " '2024-06-12',\n", + " '2024-07-17',\n", + " '2024-08-14',\n", + " '2024-09-18']" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ROUTE_EXPORT" + "rt_dates.y2024_dates" ] }, { "cell_type": "code", - "execution_count": 33, - "id": "7d685df6-f33c-430b-a878-22f7ce894aa2", + "execution_count": 5, + "id": "00b45e96-315f-4f74-af8c-74eb994057ab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'vp_agency/agency_metrics'" + "'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'" ] }, - "execution_count": 33, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ - "GTFS_DATA_DICT.rt_vs_schedule_tables.vp_agency_metrics" + "RT_SCHED_GCS" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "386cab22-a872-4c9b-8eb4-970adede9c90", + "execution_count": 6, + "id": "a666f731-821a-4d7f-adc6-36ab7ee1428c", "metadata": {}, "outputs": [], "source": [ - "analysis_date = rt_dates.DATES[\"apr2024\"]" + "ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "d07f3469-8630-41e5-a85f-dfc6e8dd544d", + "execution_count": 7, + "id": "2d7af748-ceae-4a95-88e6-e24eb788a253", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'may2022': '2022-05-04',\n", - " 'sep2022': '2022-09-14',\n", - " 'sep2022a': '2022-09-21',\n", - " 'oct2022': '2022-10-12',\n", - " 'nov2022a': '2022-11-07',\n", - " 'nov2022b': '2022-11-08',\n", - " 'nov2022c': '2022-11-09',\n", - " 'nov2022d': '2022-11-10',\n", - " 'nov2022': '2022-11-16',\n", - " 'mar2023': '2023-03-15',\n", - " 'may2023': '2023-05-17',\n", - " 'sep2023': '2023-09-13',\n", - " 'oct2023a': '2023-10-09',\n", - " 'oct2023b': '2023-10-10',\n", - " 'oct2023': '2023-10-11',\n", - " 'oct2023c': '2023-10-12',\n", - " 'oct2023d': '2023-10-13',\n", - " 'oct2023e': '2023-10-14',\n", - " 'oct2023f': '2023-10-15',\n", - " 'nov2023': '2023-11-15',\n", - " 'mar2024': '2024-03-13',\n", - " 'may2024': '2024-05-22',\n", - " 'sep2024': '2024-09-18'}" + "'vp_route_dir/route_direction_metrics'" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "{k: v for k, v in rt_dates.DATES.items() if (k[:3], k[3:]) >= ('mar', '2023')}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "bb39cfdf-43a3-46e4-b200-019eb08b2de3", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_parquet(f\"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "69d44f05-44b3-41f1-8d69-c5ddc7fb3dcd", 
- "metadata": {}, - "outputs": [], - "source": [ - "df = df.loc[df[\"time_period\"] == \"all_day\"].reset_index(drop=True)" + "ROUTE_EXPORT" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "9da30f17-f428-4dad-a889-e86b2ce605f8", + "execution_count": 8, + "id": "386cab22-a872-4c9b-8eb4-970adede9c90", "metadata": {}, "outputs": [], "source": [ - "groupby_cols = [\"caltrans_district\", \"organization_name\", \"schedule_gtfs_dataset_key\"]" + "analysis_date = \"2024-09-18\"" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "0d8b94cc-f6ee-4e91-b8c2-3f98048e81bf", + "execution_count": 9, + "id": "bb39cfdf-43a3-46e4-b200-019eb08b2de3", "metadata": {}, "outputs": [], "source": [ - "agg1 = (\n", - " df.groupby(groupby_cols)\n", - " .agg(\n", - " {\n", - " \"total_vp\": \"sum\",\n", - " \"vp_in_shape\": \"sum\",\n", - " \"total_rt_service_minutes\": \"sum\",\n", - " }\n", - " )\n", - " .reset_index()\n", - ")" + "df = pd.read_parquet(f\"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet\")" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "45a50735-5253-4434-99d2-2feb28431bd4", + "execution_count": 10, + "id": "69d44f05-44b3-41f1-8d69-c5ddc7fb3dcd", "metadata": {}, "outputs": [], "source": [ - "agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(2)\n", - "agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)" + "df = df.loc[df[\"time_period\"] == \"all_day\"].reset_index(drop=True)" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "c886f721-ae63-4b23-8100-40fafc3587d4", + "execution_count": 11, + "id": "48a80998-a13b-4580-9778-ed67958a8e78", "metadata": {}, "outputs": [ { @@ -282,266 +207,237 @@ " \n", " \n", " \n", - " 3\n", + " 2275\n", " \n", " \n", " \n", " \n", - " caltrans_district\n", - " 01 - Eureka\n", + " schedule_gtfs_dataset_key\n", + " cc53a0dbf5df90e3009b9cb5d89d80ba\n", " \n", " \n", - " organization_name\n", - " Redwood Coast Transit 
Authority\n", + " route_id\n", + " 4867\n", " \n", " \n", - " schedule_gtfs_dataset_key\n", - " 090b30e4249a7ec2b4c6a0923ed2f953\n", + " direction_id\n", + " 0.00\n", + " \n", + " \n", + " time_period\n", + " all_day\n", + " \n", + " \n", + " minutes_atleast1_vp\n", + " 1629\n", + " \n", + " \n", + " minutes_atleast2_vp\n", + " 1578\n", + " \n", + " \n", + " total_rt_service_minutes\n", + " 1627.50\n", + " \n", + " \n", + " total_scheduled_service_minutes\n", + " 1221.00\n", " \n", " \n", " total_vp\n", - " 7047\n", + " 4613\n", " \n", " \n", " vp_in_shape\n", - " 4746\n", + " 3491\n", " \n", " \n", - " total_rt_service_minutes\n", - " 2480.40\n", + " is_early\n", + " 1\n", " \n", " \n", - " vp_per_min_agency\n", - " 2.84\n", + " is_ontime\n", + " 5\n", " \n", " \n", - " spatial_accuracy_agency\n", - " 67.35\n", + " is_late\n", + " 31\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " 3\n", - "caltrans_district 01 - Eureka\n", - "organization_name Redwood Coast Transit Authority\n", - "schedule_gtfs_dataset_key 090b30e4249a7ec2b4c6a0923ed2f953\n", - "total_vp 7047\n", - "vp_in_shape 4746\n", - "total_rt_service_minutes 2480.40\n", - "vp_per_min_agency 2.84\n", - "spatial_accuracy_agency 67.35" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg1.sample().T" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3985f03a-bcd5-41f9-bcff-a5f3a1436603", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
caltrans_district
n_vp_trips37
vp_per_minute2.83
pct_in_shape0.76
pct_rt_journey_atleast1_vp1.00
pct_rt_journey_atleast2_vp0.97
pct_sched_journey_atleast1_vp1.00
pct_sched_journey_atleast2_vp1.00
rt_sched_journey_ratio1.33
avg_rt_service_minutes43.99
nameLA DOT Schedule
schedule_source_record_idrec4C3jVlVMVmxiNr
base64_urlaHR0cHM6Ly9sYWRvdGJ1cy5jb20vZ3Rmcw==
organization_source_record_idrec4pgjrmdhCh4z01
organization_nameschedule_gtfs_dataset_keytotal_vpvp_in_shapetotal_rt_service_minutesvp_per_min_agencyspatial_accuracy_agencyCity of Los Angeles
001 - EurekaCity of Eurekaa253a8d7acd57657bb98050f37dd6b0f379811800013102.612.9047.39
101 - EurekaLake Transit Authority0a3c0b21c85fb09f8db91599e14dd7f713320127725433.322.4595.89caltrans_district07 - Los Angeles
\n", "
" ], "text/plain": [ - " caltrans_district organization_name schedule_gtfs_dataset_key \\\n", - "0 01 - Eureka City of Eureka a253a8d7acd57657bb98050f37dd6b0f \n", - "1 01 - Eureka Lake Transit Authority 0a3c0b21c85fb09f8db91599e14dd7f7 \n", - "\n", - " total_vp vp_in_shape total_rt_service_minutes vp_per_min_agency \\\n", - "0 37981 18000 13102.61 2.90 \n", - "1 13320 12772 5433.32 2.45 \n", - "\n", - " spatial_accuracy_agency \n", - "0 47.39 \n", - "1 95.89 " + " 2275\n", + "schedule_gtfs_dataset_key cc53a0dbf5df90e3009b9cb5d89d80ba\n", + "route_id 4867\n", + "direction_id 0.00\n", + "time_period all_day\n", + "minutes_atleast1_vp 1629\n", + "minutes_atleast2_vp 1578\n", + "total_rt_service_minutes 1627.50\n", + "total_scheduled_service_minutes 1221.00\n", + "total_vp 4613\n", + "vp_in_shape 3491\n", + "is_early 1\n", + "is_ontime 5\n", + "is_late 31\n", + "n_vp_trips 37\n", + "vp_per_minute 2.83\n", + "pct_in_shape 0.76\n", + "pct_rt_journey_atleast1_vp 1.00\n", + "pct_rt_journey_atleast2_vp 0.97\n", + "pct_sched_journey_atleast1_vp 1.00\n", + "pct_sched_journey_atleast2_vp 1.00\n", + "rt_sched_journey_ratio 1.33\n", + "avg_rt_service_minutes 43.99\n", + "name LA DOT Schedule\n", + "schedule_source_record_id rec4C3jVlVMVmxiNr\n", + "base64_url aHR0cHM6Ly9sYWRvdGJ1cy5jb20vZ3Rmcw==\n", + "organization_source_record_id rec4pgjrmdhCh4z01\n", + "organization_name City of Los Angeles\n", + "caltrans_district 07 - Los Angeles" ] }, - "execution_count": 16, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "agg1.head(2)" - ] - }, - { - "cell_type": "markdown", - "id": "b1d4b72a-e09d-41f0-bc13-c78e65bad8b0", - "metadata": {}, - "source": [ - "### Functions " + "df.sample().T" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "02a30975-d9d5-4174-8a5f-47c1e80970df", - "metadata": {}, - "outputs": [], - "source": [ - "def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:\n", - " # start = 
datetime.datetime.now()\n", - "\n", - " ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics\n", - " AGENCY_EXPORT = dict_inputs.vp_agency_metrics\n", - "\n", - " # Read in dataframe.\n", - " df = pd.read_parquet(f\"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet\")\n", - "\n", - " # Keep only all_day.\n", - " df = df.loc[df[\"time_period\"] == \"all_day\"].reset_index(drop=True)\n", - "\n", - " # Aggregate\n", - " groupby_cols = [\n", - " \"caltrans_district\",\n", - " \"organization_name\",\n", - " \"schedule_gtfs_dataset_key\",\n", - " ]\n", - "\n", - " sum_cols = [\"total_vp\", \"vp_in_shape\", \"total_rt_service_minutes\"]\n", - " agg1 = df.groupby(groupby_cols).agg({**{e: \"sum\" for e in sum_cols}}).reset_index()\n", - "\n", - " agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(\n", - " 2\n", - " )\n", - " agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(\n", - " 2\n", - " )\n", - "\n", - " agg1 = agg1.drop(columns=sum_cols)\n", - " # Save\n", - " agg1.to_parquet(f\"{RT_SCHED_GCS}{AGENCY_EXPORT}_TEST_{analysis_date}.parquet\")\n", - "\n", - " # end = datetime.datetime.now()\n", - " # logger.info(f\"agency aggregation {analysis_date}: {end - start}\")\n", - "\n", - " return agg1" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7696d284-c8cb-4739-8131-dc873933994e", + "execution_count": 12, + "id": "9da30f17-f428-4dad-a889-e86b2ce605f8", "metadata": {}, "outputs": [], "source": [ - "analysis_date2 = rt_dates.DATES[\"apr2024\"]" + "groupby_cols = [\n", + " \"caltrans_district\",\n", + " \"organization_name\",\n", + " \"schedule_gtfs_dataset_key\",\n", + "]" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "815a5ad4-2422-44ed-86c2-2bd5c4eae693", + "execution_count": 13, + "id": "0d8b94cc-f6ee-4e91-b8c2-3f98048e81bf", "metadata": {}, "outputs": [], "source": [ - "dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables" + "agg1 = (\n", + " 
df.groupby(groupby_cols)\n", + " .agg(\n", + " {\n", + " \"total_vp\": \"sum\",\n", + " \"vp_in_shape\": \"sum\",\n", + " \"total_rt_service_minutes\": \"sum\",\n", + " }\n", + " )\n", + " .reset_index()\n", + ")" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "85a4d3ea-cf58-44e7-901f-59414416e092", + "execution_count": 14, + "id": "45a50735-5253-4434-99d2-2feb28431bd4", "metadata": {}, "outputs": [], "source": [ - "apr_df = agency_metrics(\n", - " analysis_date2,\n", - " dict_inputs,\n", - ")" + "agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(2)\n", + "agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)" ] }, { "cell_type": "code", - "execution_count": 21, - "id": "0b62649f-15b6-459d-ab1d-b2627521abfe", + "execution_count": 15, + "id": "e7ff37d4-bb02-4ca1-af61-a71a01565322", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.8153321332404478" + "0.6517265362899927" ] }, - "execution_count": 21, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "703396 / 862711" + "63718 / 97768" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "82ce25f9-3088-4d10-8427-f0fdf4f8c05f", + "execution_count": 16, + "id": "c886f721-ae63-4b23-8100-40fafc3587d4", "metadata": {}, "outputs": [ { @@ -565,200 +461,139 @@ " \n", " \n", " \n", - " 12\n", - " \n", - " \n", - " \n", - " \n", " caltrans_district\n", - " 04 - Oakland\n", - " \n", - " \n", " organization_name\n", - " Alameda-Contra Costa Transit District\n", - " \n", - " \n", " schedule_gtfs_dataset_key\n", - " c499f905e33929a641f083dad55c521e\n", - " \n", - " \n", + " total_vp\n", + " vp_in_shape\n", + " total_rt_service_minutes\n", " vp_per_min_agency\n", - " 2.02\n", - " \n", - " \n", " spatial_accuracy_agency\n", - " 81.53\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " 12\n", - "caltrans_district 04 - Oakland\n", - "organization_name Alameda-Contra Costa Transit 
District\n", - "schedule_gtfs_dataset_key c499f905e33929a641f083dad55c521e\n", - "vp_per_min_agency 2.02\n", - "spatial_accuracy_agency 81.53" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "apr_df.loc[apr_df.organization_name == \"Alameda-Contra Costa Transit District\"].T" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "d006206a-48e0-49b4-ba68-cd279fd7f0dc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
42
caltrans_district05 - San Luis Obispo
organization_nameSanta Cruz Metropolitan Transit District
schedule_gtfs_dataset_key43d8d305ee692724a532f30ea63a1cbe
vp_per_min_agency1.52
spatial_accuracy_agency94.492304 - OaklandMarin County Transit District015d67d5b75b5cf2b710bbadadfb75f5977686371836831.132.6565.17
\n", "
" ], "text/plain": [ - " 42\n", - "caltrans_district 05 - San Luis Obispo\n", - "organization_name Santa Cruz Metropolitan Transit District\n", - "schedule_gtfs_dataset_key 43d8d305ee692724a532f30ea63a1cbe\n", - "vp_per_min_agency 1.52\n", - "spatial_accuracy_agency 94.49" + " caltrans_district organization_name \\\n", + "23 04 - Oakland Marin County Transit District \n", + "\n", + " schedule_gtfs_dataset_key total_vp vp_in_shape \\\n", + "23 015d67d5b75b5cf2b710bbadadfb75f5 97768 63718 \n", + "\n", + " total_rt_service_minutes vp_per_min_agency spatial_accuracy_agency \n", + "23 36831.13 2.65 65.17 " ] }, - "execution_count": 23, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "apr_df.sample().T" + "agg1.loc[agg1.organization_name == \"Marin County Transit District\"]" ] }, { "cell_type": "markdown", - "id": "b3e76f4c-c933-490f-89bf-01369797e5b0", + "id": "f5682445-84eb-485d-bb70-6651fc576ead", "metadata": {}, "source": [ - "### Look at the files" + "### Original" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "2038f4d6-1d3e-4331-b08f-aa2812a6b749", + "execution_count": 17, + "id": "f1cf75d6-5fc0-45c5-a223-6bbe86b79992", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "RT_SCHED_GCS" + "organization_name = \"Marin County Transit District\"" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "cf1aab5e-f375-44a2-bddb-78830f29f762", + "execution_count": 18, + "id": "7997ef5b-b7b9-4ef8-9dd0-a6a40ea28ba6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'vp_agency/agency_metrics'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "dict_inputs.vp_agency_metrics" + "schd_vp_url = 
f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet\"\n", + "\n", + "# Keep only rows that are found in both schedule and real time data\n", + "vp_sched_df = pd.read_parquet(schd_vp_url)" ] }, { "cell_type": "code", - "execution_count": 26, - "id": "1fea8b05-c690-49c8-82c9-39c3f2f17e98", + "execution_count": 19, + "id": "c597a945-a110-472e-b0d6-7bec6ef91370", "metadata": {}, "outputs": [], "source": [ - "sept_df = pd.read_parquet(\"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_agency/agency_metrics_TEST_2024-09-18.parquet\")" + "vp_sched_df = vp_sched_df.loc[vp_sched_df[\"time_period\"] == \"all_day\"].reset_index(\n", + " drop=True\n", + ")" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "1699a6a0-223d-4667-b3b4-b7650387cb7f", + "execution_count": 20, + "id": "5f5a8bb7-ef34-46cf-9930-afa9ea2c664e", "metadata": {}, "outputs": [], "source": [ - "mar_df = pd.read_parquet(\"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_agency/agency_metrics_TEST_2024-03-13.parquet\")" + "schedule_and_vp_only = vp_sched_df.loc[vp_sched_df.sched_rt_category == \"schedule_and_vp\"]" ] }, { "cell_type": "code", - "execution_count": 34, - "id": "085868d8-ee1a-4849-b13a-27c90ac9f8ac", + "execution_count": 21, + "id": "7cb49efb-784a-482f-a739-f024ca3eb91f", "metadata": {}, + "outputs": [], + "source": [ + "vp_sched_df2 = (\n", + " vp_sched_df.groupby(\n", + " [\n", + " \"caltrans_district\",\n", + " \"organization_name\",\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"service_date\",\n", + " ]\n", + " )\n", + " .agg(\n", + " {\n", + " \"total_vp\": \"sum\",\n", + " \"total_rt_service_minutes\": \"sum\",\n", + " \"vp_in_shape\": \"sum\",\n", + " }\n", + " )\n", + " .reset_index()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c0538e01-a104-4f5b-aed0-8c023d23e060", + "metadata": { + "tags": [] + }, "outputs": [ { "data": { @@ -784,127 +619,85 @@ " caltrans_district\n", " 
organization_name\n", " schedule_gtfs_dataset_key\n", - " vp_per_min_agency\n", - " spatial_accuracy_agency\n", + " service_date\n", + " total_vp\n", + " total_rt_service_minutes\n", + " vp_in_shape\n", " \n", " \n", " \n", " \n", - " 0\n", - " 01 - Eureka\n", - " City of Eureka\n", - " a253a8d7acd57657bb98050f37dd6b0f\n", - " 2.90\n", - " 96.56\n", - " \n", - " \n", - " 1\n", - " 01 - Eureka\n", - " Lake Transit Authority\n", - " 0a3c0b21c85fb09f8db91599e14dd7f7\n", - " 2.44\n", - " 96.45\n", + " 956\n", + " 04 - Oakland\n", + " Marin County Transit District\n", + " 015d67d5b75b5cf2b710bbadadfb75f5\n", + " 2024-09-18\n", + " 97768\n", + " 36831.13\n", + " 63718\n", " \n", " \n", "\n", "" ], "text/plain": [ - " caltrans_district organization_name schedule_gtfs_dataset_key \\\n", - "0 01 - Eureka City of Eureka a253a8d7acd57657bb98050f37dd6b0f \n", - "1 01 - Eureka Lake Transit Authority 0a3c0b21c85fb09f8db91599e14dd7f7 \n", + " caltrans_district organization_name \\\n", + "956 04 - Oakland Marin County Transit District \n", "\n", - " vp_per_min_agency spatial_accuracy_agency \n", - "0 2.90 96.56 \n", - "1 2.44 96.45 " - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mar_df.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "ea934f21-c2e4-42dd-be5a-32b740008ba2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'vp_agency/agency_metrics_TEST_'" + " schedule_gtfs_dataset_key service_date total_vp \\\n", + "956 015d67d5b75b5cf2b710bbadadfb75f5 2024-09-18 97768 \n", + "\n", + " total_rt_service_minutes vp_in_shape \n", + "956 36831.13 63718 " ] }, - "execution_count": 38, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "f\"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_agency_metrics}_TEST_\"" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "f4881744-f5dc-4a7a-9203-88ff661741ca", - "metadata": {}, - 
"outputs": [], - "source": [ - "sort_cols = [\"schedule_gtfs_dataset_key\", \"service_date\"]" + "vp_sched_df2.loc[\n", + " (vp_sched_df2.organization_name == organization_name)\n", + " & (vp_sched_df2.service_date == \"2024-09-18\")\n", + "]" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "f76206d8-23b8-462f-8e61-9f839b12eeb7", - "metadata": {}, - "outputs": [], - "source": [ - "def concatenate_agency_level_metrics(\n", - " date_list: list\n", - ") -> pd.DataFrame:\n", - " FILE = f\"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_agency_metrics}_TEST\"\n", - " \n", - " df = time_series_utils.concatenate_datasets_across_dates(\n", - " RT_SCHED_GCS,\n", - " FILE,\n", - " date_list,\n", - " data_type = \"df\",\n", - " ).sort_values(sort_cols).reset_index(drop=True)\n", - " \n", - " return df\n" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "db75c693-2bd5-4631-9fe9-b0e302b89abf", + "execution_count": 23, + "id": "ac36b3db-447b-4c21-abe3-0cbb3f35ec68", "metadata": {}, "outputs": [], "source": [ - "analysis_date_list = rt_dates.y2024_dates" + "vp_sched_df3 = (\n", + " schedule_and_vp_only.groupby([\"caltrans_district\", \"organization_name\", \"service_date\"])\n", + " .agg(\n", + " {\n", + " \"total_vp\": \"sum\",\n", + " \"total_rt_service_minutes\": \"sum\",\n", + " \"vp_in_shape\": \"sum\",\n", + " }\n", + " )\n", + " .reset_index()\n", + ")" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "39f4dcc0-ead6-46e7-8e5a-4eebf5d03544", + "execution_count": 24, + "id": "dd41cecb-69f1-4ea6-90c8-6a6118b63c41", "metadata": {}, "outputs": [], "source": [ - "final_df = concatenate_agency_level_metrics(analysis_date_list)" + "vp_sched_df3[\"vp_per_min_agency\"] = ((vp_sched_df3.total_vp / vp_sched_df3.total_rt_service_minutes)).round(2)\n", + "vp_sched_df3[\"spatial_accuracy_agency\"] = ((vp_sched_df3.vp_in_shape / vp_sched_df3.total_vp) * 100).round(2)" ] }, { "cell_type": "code", - "execution_count": 46, - "id": 
"5ade9cb8-893b-4e27-8151-9abc31ea60c2", + "execution_count": 25, + "id": "e32a1f49-5483-466c-a208-1db84a2d0e70", "metadata": {}, "outputs": [ { @@ -928,58 +721,89 @@ " \n", " \n", " \n", - " caltrans_district\n", - " organization_name\n", - " schedule_gtfs_dataset_key\n", - " vp_per_min_agency\n", - " spatial_accuracy_agency\n", - " service_date\n", + " 477\n", " \n", " \n", " \n", " \n", - " 0\n", + " caltrans_district\n", " 04 - Oakland\n", - " Marin County Transit District\n", - " 015d67d5b75b5cf2b710bbadadfb75f5\n", - " 2.60\n", - " 90.88\n", - " 2024-01-17\n", " \n", " \n", - " 1\n", - " 04 - Oakland\n", + " organization_name\n", " Marin County Transit District\n", - " 015d67d5b75b5cf2b710bbadadfb75f5\n", - " 2.68\n", - " 90.43\n", - " 2024-02-14\n", + " \n", + " \n", + " service_date\n", + " 2024-09-18 00:00:00\n", + " \n", + " \n", + " total_vp\n", + " 67420\n", + " \n", + " \n", + " total_rt_service_minutes\n", + " 25282.30\n", + " \n", + " \n", + " vp_in_shape\n", + " 61736\n", + " \n", + " \n", + " vp_per_min_agency\n", + " 2.67\n", + " \n", + " \n", + " spatial_accuracy_agency\n", + " 91.57\n", " \n", " \n", "\n", "" ], "text/plain": [ - " caltrans_district organization_name \\\n", - "0 04 - Oakland Marin County Transit District \n", - "1 04 - Oakland Marin County Transit District \n", - "\n", - " schedule_gtfs_dataset_key vp_per_min_agency \\\n", - "0 015d67d5b75b5cf2b710bbadadfb75f5 2.60 \n", - "1 015d67d5b75b5cf2b710bbadadfb75f5 2.68 \n", - "\n", - " spatial_accuracy_agency service_date \n", - "0 90.88 2024-01-17 \n", - "1 90.43 2024-02-14 " + " 477\n", + "caltrans_district 04 - Oakland\n", + "organization_name Marin County Transit District\n", + "service_date 2024-09-18 00:00:00\n", + "total_vp 67420\n", + "total_rt_service_minutes 25282.30\n", + "vp_in_shape 61736\n", + "vp_per_min_agency 2.67\n", + "spatial_accuracy_agency 91.57" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"vp_sched_df3.loc[\n", + " (vp_sched_df3.organization_name == organization_name)\n", + " & (vp_sched_df3.service_date == \"2024-09-18\")\n", + "].T" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "08442ae2-cb81-493c-8ed8-839b70c26780", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6517265362899927" ] }, - "execution_count": 46, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "final_df.head(2)" + "63718 / 97768" ] } ], diff --git a/rt_scheduled_v_ran/12_agency_agg2.ipynb b/rt_scheduled_v_ran/12_agency_agg2.ipynb new file mode 100644 index 000000000..0e03a0da2 --- /dev/null +++ b/rt_scheduled_v_ran/12_agency_agg2.ipynb @@ -0,0 +1,634 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "257a475b-714f-4000-9f2e-0376a3951acd", + "metadata": {}, + "source": [ + "## Agency Grain Metrics\n", + "* Starting from `vp_trips` this time. \n", + "* Add it to the pipeline in `rt_scheduled_v_ran/scripts/rt_v_scheduled_agency.py`\n", + "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "343294c0-3ae5-440a-92fc-43d20408b701", + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import numpy as np\n", + "import pandas as pd\n", + "from segment_speed_utils import (\n", + " gtfs_schedule_wrangling,\n", + " helpers,\n", + " metrics,\n", + " time_series_utils,\n", + ")\n", + "from segment_speed_utils.project_vars import (\n", + " COMPILED_CACHED_VIEWS,\n", + " GTFS_DATA_DICT,\n", + " PROJECT_CRS,\n", + " RT_SCHED_GCS,\n", + " SCHED_GCS,\n", + " SEGMENT_GCS,\n", + ")\n", + "from shared_utils import catalog_utils, rt_dates, rt_utils" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "04beb077-3365-4290-a933-12a0ce750e53", + "metadata": {}, + "outputs": [], + "source": [ + 
"pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dbfba5e1-ad13-46bb-9921-74762549d9e5", + "metadata": {}, + "outputs": [], + "source": [ + "dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ee546f20-6c03-459c-ac6f-61738f66a895", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dir': '${gcs_paths.RT_SCHED_GCS}', 'stop_times_direction': 'stop_times_direction', 'sched_trip_metrics': 'schedule_trip/schedule_trip_metrics', 'sched_route_direction_metrics': 'schedule_route_dir/schedule_route_direction_metrics', 'vp_trip_metrics': 'vp_trip/trip_metrics', 'vp_route_direction_metrics': 'vp_route_dir/route_direction_metrics', 'vp_operator_metrics': 'vp_operator/operator_metrics', 'schedule_rt_stop_times': 'schedule_rt_stop_times', 'early_trip_minutes': -5, 'late_trip_minutes': 5}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0b3cfc74-2a10-434c-84ae-d723ad6396d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2024-01-17',\n", + " '2024-02-14',\n", + " '2024-03-13',\n", + " '2024-04-17',\n", + " '2024-05-22',\n", + " '2024-06-12',\n", + " '2024-07-17',\n", + " '2024-08-14',\n", + " '2024-09-18']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rt_dates.y2024_dates" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8e7e816e-6a65-46a0-941d-e4231289e203", + "metadata": {}, + "outputs": [], + "source": [ + "TRIP_EXPORT = dict_inputs.vp_trip_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"c3bf9359-318b-406c-b668-278c128e5292", + "metadata": {}, + "outputs": [], + "source": [ + "crosswalk_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"name\",\n", + " \"organization_name\",\n", + " \"caltrans_district\",]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2a51515c-37d0-4c0a-8cb4-e265d1434755", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_date = \"2024-09-18\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8d2be580-0e06-43a6-8c34-ce8fd06ea2e9", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_parquet(f\"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c4c2e26b-9878-4566-98d8-1ff71237617e", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = gtfs_schedule_wrangling.merge_operator_identifiers(\n", + " df,\n", + " [analysis_date],\n", + " columns = crosswalk_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "76a66216-0acc-4f89-b940-aa8d89bf476c", + "metadata": {}, + "source": [ + "### What time of day do I use?\n", + "`df.loc[df[\"time_period\"] == \"all_day\"]` is not available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3eef5b7d-5b36-459c-b1d8-b2f953ab54e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['AM Peak', 'Evening', 'PM Peak', 'Early AM', 'Midday', 'Owl'],\n", + " dtype=object)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.time_of_day.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fdea269f-50d9-47b4-ba4c-eaae7c5fd06c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['peak', 'offpeak'], dtype=object)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.peak_offpeak.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8e00f7f5-49c7-4a29-90b0-fe924a40c01d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keytrip_instance_keyroute_iddirection_idscheduled_service_minutestotal_vprt_service_minutesminutes_atleast1_vpminutes_atleast2_vpvp_in_shapesched_rt_categorytime_of_daypeak_offpeakvp_per_minutepct_in_shapepct_rt_journey_atleast1_vppct_rt_journey_atleast2_vppct_sched_journey_atleast1_vppct_sched_journey_atleast2_vprt_sched_journey_differenceis_earlyis_ontimeis_latenameorganization_namecaltrans_district
07cc0cb1871dfd558f11a2885c145d144000213c9d5753f9565b679d8ab84929f21.0029.0020066.256866195schedule_and_vpAM Peakpeak3.020.971.001.001.001.0037.25001Bay Area 511 Muni ScheduleCity and County of San Francisco04 - Oakland
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key trip_instance_key \\\n", + "0 7cc0cb1871dfd558f11a2885c145d144 000213c9d5753f9565b679d8ab84929f \n", + "\n", + " route_id direction_id scheduled_service_minutes total_vp \\\n", + "0 2 1.00 29.00 200 \n", + "\n", + " rt_service_minutes minutes_atleast1_vp minutes_atleast2_vp vp_in_shape \\\n", + "0 66.25 68 66 195 \n", + "\n", + " sched_rt_category time_of_day peak_offpeak vp_per_minute pct_in_shape \\\n", + "0 schedule_and_vp AM Peak peak 3.02 0.97 \n", + "\n", + " pct_rt_journey_atleast1_vp pct_rt_journey_atleast2_vp \\\n", + "0 1.00 1.00 \n", + "\n", + " pct_sched_journey_atleast1_vp pct_sched_journey_atleast2_vp \\\n", + "0 1.00 1.00 \n", + "\n", + " rt_sched_journey_difference is_early is_ontime is_late \\\n", + "0 37.25 0 0 1 \n", + "\n", + " name organization_name \\\n", + "0 Bay Area 511 Muni Schedule City and County of San Francisco \n", + "\n", + " caltrans_district \n", + "0 04 - Oakland " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7a575da4-2efc-44f4-9c56-9d3277d806a6", + "metadata": {}, + "outputs": [], + "source": [ + "groupby_cols = [\n", + " \"caltrans_district\",\n", + " \"organization_name\",\n", + " \"schedule_gtfs_dataset_key\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "42c7e971-23e7-40b9-9386-47c5b1b47b9d", + "metadata": {}, + "outputs": [], + "source": [ + "agg1 = (\n", + " df2.groupby(groupby_cols)\n", + " .agg(\n", + " {\n", + " \"total_vp\": \"sum\",\n", + " \"vp_in_shape\": \"sum\",\n", + " \"rt_service_minutes\": \"sum\",\n", + " }\n", + " )\n", + " .reset_index()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a738e9fd-143a-497c-b7af-a0884e7346d7", + "metadata": {}, + "outputs": [], + "source": [ + "agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / 
agg1.rt_service_minutes)).round(2)\n", + "agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c0c61998-c937-432e-bcf3-4850eb300c60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caltrans_districtorganization_nameschedule_gtfs_dataset_keytotal_vpvp_in_shapert_service_minutesvp_per_min_agencyspatial_accuracy_agency
001 - EurekaCity of Arcataa253a8d7acd57657bb98050f37dd6b0f381421793513367.502.8547.02
101 - EurekaCity of Eurekaa253a8d7acd57657bb98050f37dd6b0f381421793513367.502.8547.02
201 - EurekaHumboldt Transit Authoritya253a8d7acd57657bb98050f37dd6b0f381421793513367.502.8547.02
301 - EurekaLake Transit Authority0a3c0b21c85fb09f8db91599e14dd7f711572112235015.752.3196.98
401 - EurekaMendocino Transit Authority770072d7a8d356b529ef34fe01715bcb16196137026699.272.4284.60
\n", + "
" + ], + "text/plain": [ + " caltrans_district organization_name \\\n", + "0 01 - Eureka City of Arcata \n", + "1 01 - Eureka City of Eureka \n", + "2 01 - Eureka Humboldt Transit Authority \n", + "3 01 - Eureka Lake Transit Authority \n", + "4 01 - Eureka Mendocino Transit Authority \n", + "\n", + " schedule_gtfs_dataset_key total_vp vp_in_shape \\\n", + "0 a253a8d7acd57657bb98050f37dd6b0f 38142 17935 \n", + "1 a253a8d7acd57657bb98050f37dd6b0f 38142 17935 \n", + "2 a253a8d7acd57657bb98050f37dd6b0f 38142 17935 \n", + "3 0a3c0b21c85fb09f8db91599e14dd7f7 11572 11223 \n", + "4 770072d7a8d356b529ef34fe01715bcb 16196 13702 \n", + "\n", + " rt_service_minutes vp_per_min_agency spatial_accuracy_agency \n", + "0 13367.50 2.85 47.02 \n", + "1 13367.50 2.85 47.02 \n", + "2 13367.50 2.85 47.02 \n", + "3 5015.75 2.31 96.98 \n", + "4 6699.27 2.42 84.60 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ee228e7c-6479-4b9f-be77-367f50ea46e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
28
caltrans_district04 - Oakland
organization_nameMarin County Transit District
schedule_gtfs_dataset_key015d67d5b75b5cf2b710bbadadfb75f5
total_vp97768
vp_in_shape63718
rt_service_minutes36831.13
vp_per_min_agency2.65
spatial_accuracy_agency65.17
\n", + "
" + ], + "text/plain": [ + " 28\n", + "caltrans_district 04 - Oakland\n", + "organization_name Marin County Transit District\n", + "schedule_gtfs_dataset_key 015d67d5b75b5cf2b710bbadadfb75f5\n", + "total_vp 97768\n", + "vp_in_shape 63718\n", + "rt_service_minutes 36831.13\n", + "vp_per_min_agency 2.65\n", + "spatial_accuracy_agency 65.17" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg1.loc[agg1.organization_name == \"Marin County Transit District\"].T" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py b/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py index 0e6a2555e..25a0d2488 100644 --- a/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py +++ b/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py @@ -12,18 +12,27 @@ from update_vars import RT_SCHED_GCS, GTFS_DATA_DICT from shared_utils import rt_dates -def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame: +def operator_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame: start = datetime.datetime.now() - ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics - AGENCY_EXPORT = dict_inputs.vp_agency_metrics + TRIP_EXPORT = dict_inputs.vp_trip_metrics + OP_EXPORT = dict_inputs.vp_operator_metrics # Read in dataframe. - df = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet") - - # Keep only all_day. 
- df = df.loc[df["time_period"] == "all_day"].reset_index(drop=True) - + df = pd.read_parquet(f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet") + + # Merge in identifiers + crosswalk_cols = [ + "schedule_gtfs_dataset_key", + "name", + "organization_name", + "caltrans_district",] + + df2 = gtfs_schedule_wrangling.merge_operator_identifiers( + df, + [analysis_date], + columns = crosswalk_cols) + # Aggregate groupby_cols = [ "caltrans_district", @@ -31,21 +40,17 @@ def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame: "schedule_gtfs_dataset_key", ] - sum_cols = ["total_vp", "vp_in_shape", "total_rt_service_minutes"] - agg1 = df.groupby(groupby_cols).agg({**{e: "sum" for e in sum_cols}}).reset_index() + sum_cols = ["total_vp", "vp_in_shape", "rt_service_minutes"] + agg1 = df2.groupby(groupby_cols).agg({**{e: "sum" for e in sum_cols}}).reset_index() - agg1["vp_per_min_agency"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round( - 2 - ) - agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round( - 2 - ) + agg1["vp_per_min_agency"] = ((agg1.total_vp / agg1.rt_service_minutes)).round(2) + agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2) - # Cleanrt_V + # Clean agg1 = agg1.drop(columns=sum_cols) - # Save: take out test later - agg1.to_parquet(f"{RT_SCHED_GCS}{AGENCY_EXPORT}_TEST_{analysis_date}.parquet") + # Save + agg1.to_parquet(f"{RT_SCHED_GCS}{OP_EXPORT}_{analysis_date}.parquet") end = datetime.datetime.now() logger.info(f"agency aggregation {analysis_date}: {end - start}") @@ -54,7 +59,7 @@ def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame: if __name__ == "__main__": - LOG_FILE = "../logs/rt_v_scheduled_agency_metrics.log" + LOG_FILE = "../logs/rt_v_scheduled_operator_metrics.log" logger.add(LOG_FILE, retention="3 months") logger.add(sys.stderr, format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", @@ -65,4 +70,4 @@ def 
agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame: dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables for analysis_date in analysis_date_list: - agency_metrics(analysis_date, dict_inputs) \ No newline at end of file + operator_metrics(analysis_date, dict_inputs) \ No newline at end of file diff --git a/rt_scheduled_v_ran/scripts/update_vars.py b/rt_scheduled_v_ran/scripts/update_vars.py index 455bbd82b..1f8daf291 100644 --- a/rt_scheduled_v_ran/scripts/update_vars.py +++ b/rt_scheduled_v_ran/scripts/update_vars.py @@ -6,7 +6,7 @@ apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True) # analysis_date_list = [rt_dates.DATES["sep2024"]] -analysis_date_list = rt_dates.y2024_dates +analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")