diff --git a/_shared_utils/shared_utils/gtfs_analytics_data.yml b/_shared_utils/shared_utils/gtfs_analytics_data.yml
index ea7561ded..48cf97884 100644
--- a/_shared_utils/shared_utils/gtfs_analytics_data.yml
+++ b/_shared_utils/shared_utils/gtfs_analytics_data.yml
@@ -68,6 +68,7 @@ digest_tables:
operator_profiles: "digest/operator_profiles"
operator_routes_map: "digest/operator_routes"
operator_sched_rt: "digest/operator_schedule_rt_category"
+ operator_metrics: "digest/operator_metrics"
scheduled_service_hours: "digest/total_scheduled_service_hours"
stop_segments:
diff --git a/gtfs_digest/03_report.ipynb b/gtfs_digest/03_report.ipynb
index 94e1a4530..e5ab9fea9 100644
--- a/gtfs_digest/03_report.ipynb
+++ b/gtfs_digest/03_report.ipynb
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"id": "6bd20d9d-a3af-430e-8c19-c90fb8ef9e62",
"metadata": {
"tags": [
@@ -60,20 +60,20 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"id": "d870c492-ef2c-45f6-ab47-8d46eda7f344",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
- "%%capture_parameters\n",
- "organization_name"
+ " %%capture_parameters\n",
+ " organization_name"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"id": "b8e11fd2-041f-4e1d-a00f-6e000269c1a7",
"metadata": {},
"outputs": [],
@@ -84,7 +84,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "8e840f91-2e1a-4235-bf6b-0c049a569b4a",
"metadata": {},
"outputs": [],
@@ -96,7 +96,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"id": "517702ae-a7ac-4cc4-a2d4-158fdc8d6919",
"metadata": {},
"outputs": [],
@@ -106,7 +106,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"id": "dd8b7a4c-7682-4949-9e9b-990ce6867627",
"metadata": {},
"outputs": [],
@@ -114,6 +114,20 @@
"scheduled_service = section1.load_operator_service_hours(name)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "72641974-01d5-47cf-b963-baf546e9e958",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Dataset with agency metrics\n",
+ "try:\n",
+ " agency_metrics_df = section2.load_operator_metrics(organization_name)\n",
+ "except:\n",
+ " pass"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 10,
@@ -342,23 +356,23 @@
"text/html": [
"\n",
"\n",
- "
\n",
+ "\n",
""
],
"text/plain": [
@@ -1094,7 +1108,7 @@
],
"source": [
"try:\n",
- " display(section2.agency_overview(sched_vp_df))\n",
+ " display(section2.agency_overview(agency_metrics_df))\n",
"except:\n",
" display(Markdown(f\"\"\"{organization_name} only has schedule data.\"\"\"))"
]
@@ -1118,23 +1132,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "agency_spatial_chart"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "80222431-fb3c-4528-8cd9-ca7af926466d",
"metadata": {},
"outputs": [],
"source": [
- "def simple_bar_chart(\n",
- " df: pd.DataFrame,\n",
- " y_col: str,\n",
- " ruler_col: str,\n",
- " title: str,\n",
- " subtitle: str,\n",
- " domain_color:list,\n",
- " range_color:list,\n",
- ") -> alt.Chart:\n",
- " tooltip_cols = [\n",
- " \"Date\",\n",
- " y_col,\n",
- " ]\n",
- " \n",
- " # Set y-axis\n",
- " max_y = _section2_utils.set_y_axis(df, y_col)\n",
- " \n",
- " # Create color scale\n",
- " color_scale = alt.Scale(\n",
- " domain= domain_color,\n",
- " range = range_color\n",
- " )\n",
- " \n",
- " # Create ruler\n",
- " ruler = (\n",
- " alt.Chart(df)\n",
- " .mark_rule(color=\"red\", strokeDash=[10, 7])\n",
- " .encode(y=f\"mean({ruler_col}):Q\")\n",
- " )\n",
- " \n",
- " chart = (\n",
- " alt.Chart(df)\n",
- " .mark_bar(size=7, clip=True)\n",
- " .encode(\n",
- " x=alt.X(\n",
- " \"yearmonthdate(Date):O\",\n",
- " title=[\"Date\"],\n",
- " axis=alt.Axis(labelAngle=-45, format=\"%b %Y\"),\n",
- " ),\n",
- " y=alt.Y(\n",
- " f\"{y_col}:Q\",\n",
- " title=_report_utils.labeling(y_col),\n",
- " scale=alt.Scale(domain=[0, max_y]),\n",
- " ),\n",
- " color=alt.Color(\n",
- " f\"{y_col}:Q\",\n",
- " title=_report_utils.labeling(y_col),\n",
- " scale=color_scale,\n",
- " ),\n",
- " tooltip=df[tooltip_cols].columns.tolist(),\n",
+ "agency_vp_chart = (\n",
+ " (\n",
+ " section2.simple_bar_chart(\n",
+ " df,\n",
+ " \"VP per Minute (All Routes)\",\n",
+ " \"ruler_for_vp_per_min\",\n",
+ " readable_dict[\"agency_vp_per_min_graph\"][\"title\"],\n",
+ " readable_dict[\"vp_per_min_graph\"][\"subtitle\"],\n",
+ " color_dict[\"vp_domain\"],\n",
+ " color_dict[\"vp_range\"]\n",
" )\n",
" )\n",
- "\n",
- " chart = (chart + ruler).properties(width=400, height=250,\n",
- " title={\n",
- " \"text\": title,\n",
- " \"subtitle\": [subtitle],\n",
- " }\n",
- " )\n",
- "\n",
- " return chart"
+ " )"
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "cafb446d-f0a6-4130-9bea-603081a967d7",
+ "execution_count": 12,
+ "id": "b93659f6-5b53-4384-ae26-eb946a2bf993",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "agency_vp_chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9dbdbd09-42b1-470e-8b91-3b8cdcd11ee7",
+ "metadata": {},
+ "source": [
+ "### Look at City of Visalia\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "d8456dd5-9ea0-459f-8af0-4101eacbede7",
"metadata": {},
"outputs": [],
"source": [
- "def agency_overview(df:pd.DataFrame)->alt.Chart:\n",
- " agg1 = aggregate_by_agency(df)\n",
- " \n",
- " agency_spatial_chart = (\n",
- " simple_bar_chart(\n",
- " agg1,\n",
+ "df2 = load_operator_metrics(\"City of Visalia\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "15f80dda-e628-4bc9-bf5d-160411490bd1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " (\n",
+ " section2.simple_bar_chart(\n",
+ " df2,\n",
+ " \"VP per Minute (All Routes)\",\n",
+ " \"ruler_for_vp_per_min\",\n",
+ " readable_dict[\"agency_vp_per_min_graph\"][\"title\"],\n",
+ " readable_dict[\"vp_per_min_graph\"][\"subtitle\"],\n",
+ " color_dict[\"vp_domain\"],\n",
+ " color_dict[\"vp_range\"]\n",
+ " )\n",
+ " )\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "14559142-ba86-4a4b-880c-1b7578ae5b66",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
+ " section2.simple_bar_chart(\n",
+ " df2,\n",
" \"Spatial Accuracy (All Routes)\",\n",
" \"ruler_100_pct\",\n",
" readable_dict[\"agency_spatial_accuracy\"][\"title\"],\n",
@@ -200,11 +567,117 @@
" color_dict[\"spatial_accuracy_range\"]\n",
" )\n",
" )\n",
- " \n",
- " agency_vp_chart = (\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41420e6e-318c-4006-a475-bb7709d93820",
+ "metadata": {},
+ "source": [
+ "### Orange County Transportation Authority"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "8c9bffb8-5e74-4592-984c-514ab8a3f166",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df3 = load_operator_metrics(\"Orange County Transportation Authority\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "46025ae3-59c6-471c-bb3c-6f2f5838ba99",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.LayerChart(...)"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(\n",
" (\n",
- " simple_bar_chart(\n",
- " agg1,\n",
+ " section2.simple_bar_chart(\n",
+ " df3,\n",
" \"VP per Minute (All Routes)\",\n",
" \"ruler_for_vp_per_min\",\n",
" readable_dict[\"agency_vp_per_min_graph\"][\"title\"],\n",
@@ -213,19 +686,13 @@
" color_dict[\"vp_range\"]\n",
" )\n",
" )\n",
- " )\n",
- " \n",
- " chart_list = [agency_spatial_chart, agency_vp_chart]\n",
- " chart = alt.vconcat(*chart_list).resolve_scale(\n",
- " color='independent')\n",
- "\n",
- " return chart"
+ " )"
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "65259c01-bd91-4200-9572-bfdd5e7c6f98",
+ "execution_count": 18,
+ "id": "af863f21-ba7d-4980-ab56-c6fe6b20c087",
"metadata": {},
"outputs": [
{
@@ -233,23 +700,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
- "alt.VConcatChart(...)"
+ "alt.LayerChart(...)"
]
},
- "execution_count": 9,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "agency_overview(df)"
+ "(\n",
+ " section2.simple_bar_chart(\n",
+ " df3,\n",
+ " \"Spatial Accuracy (All Routes)\",\n",
+ " \"ruler_100_pct\",\n",
+ " readable_dict[\"agency_spatial_accuracy\"][\"title\"],\n",
+ " readable_dict[\"spatial_accuracy_graph\"][\"subtitle\"],\n",
+ " color_dict[\"spatial_accuracy_domain\"],\n",
+ " color_dict[\"spatial_accuracy_range\"]\n",
+ " )\n",
+ " )\n",
+ " "
]
}
],
diff --git a/gtfs_digest/35_agg_by_agency2.ipynb b/gtfs_digest/35_agg_by_agency2.ipynb
new file mode 100644
index 000000000..bc29f1a40
--- /dev/null
+++ b/gtfs_digest/35_agg_by_agency2.ipynb
@@ -0,0 +1,70 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2682013d-2d46-4584-a421-38b4f9fe9a13",
+ "metadata": {},
+ "source": [
+ "## Use `vp_trips` as the jumping off point"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "49127032-4559-415b-b7c9-9d3fb9e8c50c",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'shared_utils'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_aggregate_agency\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_report_utils\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_section1_utils\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msection1\u001b[39;00m\n",
+ "File \u001b[0;32m~/data-analyses/gtfs_digest/_aggregate_agency.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01m_operators_prep\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgeopandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mgpd\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n",
+ "File \u001b[0;32m~/data-analyses/gtfs_digest/_operators_prep.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mshared_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m catalog_utils\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'shared_utils'"
+ ]
+ }
+ ],
+ "source": [
+ "import _aggregate_agency\n",
+ "import _report_utils\n",
+ "import _section1_utils as section1\n",
+ "import _section2_utils as section2\n",
+ "import geopandas as gpd\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "993d3d22-1fd2-46c2-a288-85cd5c0021fa",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gtfs_digest/_archive_section2_utils.py b/gtfs_digest/_archive_section2_utils.py
deleted file mode 100644
index 219f8fb2b..000000000
--- a/gtfs_digest/_archive_section2_utils.py
+++ /dev/null
@@ -1,1009 +0,0 @@
-import calitp_data_analysis.magics
-import geopandas as gpd
-import pandas as pd
-
-# Charts
-from calitp_data_analysis import calitp_color_palette as cp
-import altair as alt
-alt.data_transformers.enable('default', max_rows=None)
-
-# Great Tables
-import great_tables as gt
-from great_tables import md
-
-# Display
-from IPython.display import HTML, Markdown, display
-
-# Other
-from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS
-from shared_utils import catalog_utils, rt_dates, rt_utils
-GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
-
-import _report_utils
-"""
-Schedule_vp_metrics
-Functions
-"""
-def timeliness_tags(row):
- if row.rt_sched_journey_ratio < 1:
- return "Early"
- elif row.rt_sched_journey_ratio < 1.1:
- return "On Time"
- elif 1.1 <= row.rt_sched_journey_ratio < 1.26:
- return "Late by 1-25% of the scheduled time"
- elif 1.26 <= row.rt_sched_journey_ratio < 1.51:
- return "Late by 26-50% of the scheduled time"
- elif 1.51 <= row.rt_sched_journey_ratio:
- return "Late by 50%+ of the scheduled time"
- else:
- return "No Info"
-
-def frequency_tags(row):
- if row.frequency < 2:
- return "<1 trip/hour"
- elif 1 <= row.frequency < 2:
- return "1 trip/hour"
- elif 2 <= row.frequency < 3:
- return "2 trips/hour"
- elif 3 <= row.frequency:
- return "3+ trips/hour"
- else:
- return "No Info"
-
-
-def vp_per_min_tag(row):
- if row.vp_per_minute < 1:
- return "<1 ping/minute"
- elif 1 <= row.vp_per_minute < 2:
- return "<3 pings/minute"
- elif 2 <= row.vp_per_minute < 3:
- return "<3 pings/minute"
- elif 3 <= row.vp_per_minute:
- return "3+ pings per minute (target)"
- else:
- return "No Info"
-
-def add_categories(df:pd.DataFrame) -> pd.DataFrame:
- df["rt_sched_journey_ratio_cat"] = df.apply(timeliness_tags, axis=1)
- df["frequency_cat"] = df.apply(frequency_tags, axis=1)
- df["vp_per_minute_cat"] = df.apply(vp_per_min_tag, axis=1)
-
- return df
-
-def load_schedule_vp_metrics(name:str)->pd.DataFrame:
- schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
-
- df = pd.read_parquet(schd_vp_url, filters=[[("name", "==", name)]])
-
- # Categorize
- df = add_categories(df)
-
- # Round float columns
- float_columns = df.select_dtypes(include=['float'])
- for i in float_columns:
- df[i] = df[i].round(2)
-
- pct_cols = df.columns[df.columns.str.contains("pct")].tolist()
- for i in pct_cols:
- df[i] = df[i] * 100
-
- # Add rulers
- df["ruler_100_pct"] = 100
- df["ruler_for_vp_per_min"] = 2
- return df
-
-def route_stats(df: pd.DataFrame) -> pd.DataFrame:
- most_recent_date = df.service_date.max()
- route_merge_cols = ["route_combined_name", "direction_id"]
-
- all_day_stats = df[
- (df.service_date == most_recent_date) & (df.time_period == "all_day")
- ][
- route_merge_cols
- + [
- "avg_scheduled_service_minutes",
- "avg_stop_miles",
- "n_scheduled_trips",
- "sched_rt_category",
- ]
- ]
-
- peak_stats = df[(df.service_date == most_recent_date) & (df.time_period == "peak")][
- route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]
- ].rename(
- columns={
- "speed_mph": "peak_avg_speed",
- "n_scheduled_trips": "peak_scheduled_trips",
- "frequency": "peak_hourly_freq",
- }
- )
-
- offpeak_stats = df[
- (df.service_date == most_recent_date) & (df.time_period == "offpeak")
- ][route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]].rename(
- columns={
- "speed_mph": "offpeak_avg_speed",
- "n_scheduled_trips": "offpeak_scheduled_trips",
- "frequency": "offpeak_hourly_freq",
- }
- )
-
- table_df = (
- pd.merge(
- all_day_stats,
- peak_stats,
- on=route_merge_cols,
- how = "outer"
- )
- .merge(offpeak_stats, on=route_merge_cols, how = "outer")
- .sort_values(["route_combined_name", "direction_id"])
- .reset_index(drop=True)
- )
-
- numeric_cols = table_df.select_dtypes(include="number").columns
- table_df[numeric_cols] = table_df[numeric_cols].fillna(0)
-
- return table_df
-
-def timeliness_trips(df: pd.DataFrame):
- to_keep = [
- "service_date",
- "organization_name",
- "direction_id",
- "time_period",
- "route_combined_name",
- "is_early",
- "is_ontime",
- "is_late",
- "n_vp_trips",
- ]
- df = df[to_keep]
- df2 = df.loc[df.time_period != "all_day"].reset_index(drop=True)
-
- melted_df = df2.melt(
- id_vars=[
- "service_date",
- "organization_name",
- "route_combined_name",
- "time_period",
- "direction_id",
- ],
- value_vars=["is_early", "is_ontime", "is_late"],
- )
- return melted_df
-
-def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
- to_keep = [
- "service_date",
- "organization_name",
- "direction_id",
- col1,
- col2,
- "route_combined_name",
- "time_period",
- "route_id",
- "ruler_100_pct",
- ]
- df2 = df[to_keep]
-
- df3 = df2.melt(
- id_vars=[
- "service_date",
- "organization_name",
- "route_combined_name",
- "direction_id",
- "time_period",
- "route_id",
- "ruler_100_pct",
- ],
- value_vars=[col1, col2],
- )
-
- return df3
-
-"""
-Operator Level
-"""
-def trips_by_gtfs(df):
- df = df.loc[df.time_period=="all_day"]
-
- by_date_category = (
- pd.crosstab(
- df.service_date,
- df.sched_rt_category,
- values=df.n_scheduled_trips,
- aggfunc="sum",
- )
- .reset_index()
- .fillna(0))
-
- display(gt.GT(by_date_category, rowname_col="service_date")
- .tab_header(
- title="Daily Trips by GTFS Availability",
- subtitle="Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data.",
- )
- .cols_label(
- schedule_only="Schedule Only",
- vp_only="VP Only",
- schedule_and_vp="Schedule and VP",
- )
- .fmt_integer(["schedule_only", "vp_only", "schedule_and_vp"])
- .tab_options(container_width="75%")
- .tab_options(table_font_size="12px"))
-
-"""
-operator_schedule_rt_category
-"""
-def load_operator_schedule_rt_category(schedule_gtfs_key: list) -> pd.DataFrame:
- df = pd.read_parquet(
- f"{RT_SCHED_GCS}digest/operator_schedule_rt_category.parquet",
- filters=[[("schedule_gtfs_dataset_key", "in", schedule_gtfs_key)]],
- )
- df.n_trips = df.n_trips.astype(int).fillna(0)
- return df
-
-
-"""
-Charts
-"""
-def create_data_unavailable_chart():
- data = pd.DataFrame({"text": ["Chart unavailable, not enough data."]})
-
- # Create a text chart using Altair
- chart = (
- alt.Chart(data)
- .mark_text(
- align="center",
- baseline="middle",
- fontSize=12,
- text="Chart unavailable due to lack of data",
- )
- .properties(width=500, height=100)
- )
-
- return chart
-
-def clean_data_charts(df:pd.DataFrame, y_col:str)->pd.DataFrame:
- df = df.assign(
- time_period=df.time_period.str.replace("_", " ").str.title()
- ).reset_index(drop=True)
-
- df[y_col] = df[y_col].fillna(0).astype(int)
- df[f"{y_col}_str"] = df[y_col].astype(str)
-
-
- return df
-
-def grouped_bar_chart(
- df: pd.DataFrame,
- color_col: str,
- y_col: str,
- offset_col: str,
- title: str,
- subtitle: str,
-):
- tooltip_cols = [
- "direction_id",
- "time_period",
- "route_combined_name",
- "organization_name",
- "service_date",
- color_col,
- y_col,
- ]
-
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
- else:
- df = clean_data_charts(df,y_col)
- chart = (
- alt.Chart(df)
- .mark_bar(size=10)
- .encode(
- x=alt.X(
- "yearmonthdate(service_date):O",
- title=["Grouped by Direction ID", "Date"],
- axis=alt.Axis(labelAngle=-45, format="%b %Y"),
- ),
- y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
- xOffset=alt.X(f"{offset_col}:N", title=_report_utils.labeling(offset_col)),
- color=alt.Color(
- f"{color_col}:N",
- title=_report_utils.labeling(color_col),
- scale=alt.Scale(
- range=_report_utils.red_green_yellow,
- ),
- ),
- tooltip=tooltip_cols,
- )
- )
- chart = (chart).properties(
- title={
- "text": [title],
- "subtitle": [subtitle],
- },
- width=500,
- height=300,
- )
-
- return chart
-
-def base_facet_line(
- df: pd.DataFrame, y_col: str, title: str, subtitle: str
-) -> alt.Chart:
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
- else:
- selection = alt.selection_point(fields=['time_period'], bind='legend')
-
- df = clean_data_charts(df,y_col)
- tooltip_cols = [
- "route_combined_name",
- "route_id",
- "direction_id",
- "time_period",
- f"{y_col}_str",
- ]
- if "pct" in y_col:
- max_y = 100
- elif "per_minute" in y_col:
- max_y = round(df[y_col].max())
- else:
- max_y = round(df[y_col].max(), -1) + 5
- chart = (
- alt.Chart(df)
- .mark_line(size=5)
- .encode(
- x=alt.X(
- "yearmonthdate(service_date):O",
- title="Date",
- axis=alt.Axis(labelAngle=-45, format="%b %Y"),
- ),
- y=alt.Y(
- f"{y_col}:Q",
- title=_report_utils.labeling(y_col),
- scale=alt.Scale(domain=[0, max_y]),
- ),
- color=alt.Color(
- "time_period:N",
- title=_report_utils.labeling("time_period"),
- scale=alt.Scale(range=_report_utils.red_green_yellow),
- ),
-
- strokeWidth=alt.condition(
- "datum.time_peak == 'All Day'",
- alt.value(10),
- alt.value(1)),
-
- tooltip=tooltip_cols,
- )
- )
-
- chart = chart.properties(width=250, height=300)
- chart = chart.facet(
- column=alt.Column("direction_id:N", title=_report_utils.labeling("direction_id")),
- ).properties(
- title={
- "text": [title],
- "subtitle": [subtitle],
- }
- ).add_params(selection)
- return chart
-def base_facet_circle(
- df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str
-) -> alt.Chart:
-
- tooltip_cols = [
- "direction_id",
- "time_period",
- "route_combined_name",
- "service_date",
- f"{y_col}_str",
- "variable",
- ]
-
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
- else:
- if "pct" in y_col:
- max_y = 100
- elif "per_minute" in y_col:
- max_y = round(df[y_col].max())
- else:
- max_y = round(df[y_col].max(), -1) + 5
- df = clean_data_charts(df,y_col)
- df = df.assign(
- variable=df.variable.str.replace("_", " ").str.title(),
- ).reset_index(drop=True)
- ruler = (
- alt.Chart(df)
- .mark_rule(color="red", strokeDash=[10, 7])
- .encode(y=f"ruler_100_pct:Q")
- )
-
- chart = (
- alt.Chart(df)
- .mark_circle(size=100)
- .encode(
- x=alt.X(
- "yearmonthdate(service_date):O",
- title="Date",
- axis=alt.Axis(labelAngle=-45, format="%b %Y"),
- ),
- y=alt.Y(
- f"{y_col}:Q",
- title=_report_utils.labeling(y_col),
- scale=alt.Scale(domain=[0, max_y]),
- ),
- color=alt.Color(
- "variable:N",
- title=_report_utils.labeling("variable"),
- scale=alt.Scale(range=_report_utils.red_green_yellow),
- ),
- tooltip=tooltip_cols,
- )
- )
-
- chart = chart + ruler
- chart = chart.facet(
- column=alt.Column("direction_id:N", title=_report_utils.labeling("direction_id")),
- ).properties(
- title={
- "text": [title],
- "subtitle": [subtitle],
- }
- )
- return chart
-def base_facet_chart(
- df: pd.DataFrame,
- y_col: str,
- color_col: str,
- facet_col: str,
- title: str,
- subtitle: str,
-):
- tooltip_cols = [
- "direction_id",
- "time_period",
- "route_combined_name",
- "organization_name",
- "service_date",
- y_col,
- color_col,
- ]
-
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
- else:
- if "pct" in y_col:
- max_y = 100
- elif "per_minute" in y_col:
- max_y = round(df[y_col].max())
- else:
- max_y = round(df[y_col].max(), -1) + 5
- df = clean_data_charts(df,y_col)
- chart = (
- (
- alt.Chart(df)
- .mark_bar(size=15, clip=True)
- .encode(
- x=alt.X(
- "yearmonthdate(service_date):O",
- title=["Service Date"],
- axis=alt.Axis(labelAngle=-45, format="%b %Y"),
- ),
- y=alt.Y(
- f"{y_col}:Q",
- title=_report_utils.labeling(y_col),
- scale=alt.Scale(domain=[0, max_y]),
- ),
- color=alt.Color(
- f"{color_col}:N",
- title=_report_utils.labeling(color_col),
- scale=alt.Scale(range=_report_utils.red_green_yellow),
- ),
- tooltip=tooltip_cols,
- )
- )
- .facet(
- column=alt.Column(
- f"{facet_col}:N",
- )
- )
- .properties(
- title={
- "text": title,
- "subtitle": subtitle,
- }
- )
- )
- return chart
-
-def base_facet_with_ruler_chart(
- df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str
-):
- tooltip_cols = [
- "direction_id",
- "time_period",
- "route_combined_name",
- "organization_name",
- "service_date",
- y_col,
- ]
-
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
- else:
- df = clean_data_charts(df,y_col)
- if "pct" in y_col:
- max_y = 100
- elif "per_minute" in y_col:
- max_y = round(df[y_col].max()) + 2
- else:
- max_y = round(df[y_col].max(), -1) + 5
- ruler = (
- alt.Chart(df)
- .mark_rule(color="red", strokeDash=[10, 7])
- .encode(y=f"mean({ruler_col}):Q")
- )
- chart = (
- alt.Chart(df)
- .mark_bar(size=15, clip=True)
- .encode(
- x=alt.X(
- "yearmonthdate(service_date):O",
- title=["Service Date"],
- axis=alt.Axis(labelAngle=-45, format="%b %Y"),
- ),
- y=alt.Y(
- f"{y_col}:Q",
- title=_report_utils.labeling(y_col),
- scale=alt.Scale(domain=[0, max_y]),
- ),
- color=alt.Color(
- f"{y_col}:Q",
- title=_report_utils.labeling(y_col),
- scale=alt.Scale(range=_report_utils.red_green_yellow),
- ),
- tooltip=df[tooltip_cols].columns.tolist(),
- )
- )
-
- chart = chart + ruler
- chart = chart.facet(column=alt.Column("direction_id:N",)).properties(
- title={
- "text": title,
- "subtitle": [subtitle],
- }
- )
-
- return chart
-
-def create_text_table(df: pd.DataFrame, direction_id: str):
-
- df = (
- df.loc[df.direction_id == direction_id].drop_duplicates().reset_index(drop=True)
- )
-
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
-
- else:
- df2 = df.melt(
- id_vars=[
- "route_combined_name",
- "direction_id",
- ],
- value_vars=[
- "avg_scheduled_service_minutes",
- "avg_stop_miles",
- "n_scheduled_trips",
- "sched_rt_category",
- "peak_avg_speed",
- "peak_scheduled_trips",
- "peak_hourly_freq",
- "offpeak_avg_speed",
- "offpeak_scheduled_trips",
- "offpeak_hourly_freq",
- ],
- )
- # Create a decoy column to center all the text
- df2["Zero"] = 0
-
- df2.variable = df2.variable.str.replace("_", " ").str.title()
- df2 = df2.sort_values(by=["direction_id"]).reset_index(drop=True)
- df2["combo_col"] = df2.variable.astype(str) + ": " + df2.value.astype(str)
- text_chart = (
- alt.Chart(df2)
- .mark_text()
- .encode(x=alt.X("Zero:Q", axis=None), y=alt.Y("combo_col", axis=None))
- )
-
- text_chart = text_chart.encode(text="combo_col:N").properties(
- title=f"Route Statistics for Direction {direction_id}",
- width=500,
- height=300,
- )
- return text_chart
-
-def frequency_chart(df: pd.DataFrame):
- if len(df) == 0:
- text_chart = create_data_unavailable_chart()
- return text_chart
-
- else:
- chart = (
- alt.Chart(df, width=180, height=alt.Step(10))
- .mark_bar()
- .encode(
- alt.Y(
- "yearmonthdate(service_date):O",
- title="Date",
- axis=alt.Axis(format="%b %Y"),
- ),
- alt.X("frequency:Q", title=_report_utils.labeling("frequency"), axis=None),
- alt.Color("frequency", scale=alt.Scale(range=_report_utils.red_green_yellow)).title(
- _report_utils.labeling("Frequency")
- ),
- alt.Row("time_period:N")
- .title(_report_utils.labeling("time_period"))
- .header(labelAngle=0),
- alt.Column("direction_id:N").title(_report_utils.labeling("direction_id")),
- )
- )
-
- chart = chart.properties(title="Frequency of Trips per Hour")
- return chart
-"""
-Route-Direction
-Section
-"""
-def filtered_route(
- df: pd.DataFrame,
-) -> alt.Chart:
- """
- https://stackoverflow.com/questions/58919888/multiple-selections-in-altair
- """
- # Filter for only schedule and vp
- df_sched_vp_both = df[df.sched_rt_category == "schedule_and_vp"].reset_index(
- drop=True
- )
- routes_list = df_sched_vp_both["route_combined_name"].unique().tolist()
-
-
- route_dropdown = alt.binding_select(
- options=routes_list,
- name="Routes",
- )
-
- # Column that controls the bar charts
- route_selector = alt.selection_point(
- fields=["route_combined_name"],
- bind=route_dropdown,
- )
-
- # Data
- # Filter for only rows categorized as found in schedule and vp and all_day
- all_day = df_sched_vp_both.loc[
- df_sched_vp_both.time_period == "all_day"
- ].reset_index(drop=True)
-
- # Create route stats table for the text tables
- route_stats_df = route_stats(df)
-
- # Manipulate the df for some of the metrics
- timeliness_df = timeliness_trips(df_sched_vp_both)
- rt_journey_vp = pct_vp_journey(
- all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
- )
- sched_journey_vp = pct_vp_journey(
- all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
- )
-
- # Charts
- avg_scheduled_min = (
- grouped_bar_chart(
- df=all_day.drop_duplicates(),
- color_col="direction_id",
- y_col="avg_scheduled_service_minutes",
- offset_col="direction_id",
- title="Average Scheduled Minutes",
- subtitle="The average minutes a trip is scheduled to run.",
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
-
- timeliness_trips_dir_0 = (
- (
- base_facet_chart(
- timeliness_df.loc[timeliness_df.direction_id == 0].drop_duplicates(),
- "value",
- "variable",
- "time_period",
- "Breakdown of Trips by Categories for Direction 0",
- "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
- )
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
- timeliness_trips_dir_1 = (
- (
- base_facet_chart(
- timeliness_df.loc[timeliness_df.direction_id == 1].drop_duplicates(),
- "value",
- "variable",
- "time_period",
- "Breakdown of Trips by Categories for Direction 1",
- "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
- )
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
-
- frequency = (
- frequency_chart(df_sched_vp_both)
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
- speed = (
- base_facet_line(
- df_sched_vp_both,
- "speed_mph",
- "Average Speed",
- "The average miles per hour the bus travels by direction and time of day.",
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
-
- vp_per_min = (
- (
- base_facet_with_ruler_chart(
- all_day.drop_duplicates(),
- "vp_per_minute",
- "ruler_for_vp_per_min",
- "Vehicle Positions per Minute",
- "Trips should have 2+ vehicle positions per minute.",
- )
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
-
- rt_vp_per_min = (
- base_facet_circle(
- rt_journey_vp,
- "value",
- "ruler_100_pct",
- "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",
- "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
- sched_vp_per_min = (
- base_facet_circle(
- sched_journey_vp,
- "value",
- "sched_journey_vp",
- "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
- "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
- spatial_accuracy = (
- base_facet_with_ruler_chart(
- all_day.drop_duplicates(),
- "pct_in_shape",
- "ruler_100_pct",
- "Spatial Accuracy",
- "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
- )
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
-
- text_dir0 = (
- (create_text_table(route_stats_df, 0))
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
- text_dir1 = (
- create_text_table(route_stats_df, 1)
- .add_params(route_selector)
- .transform_filter(route_selector)
- )
- chart_list = [
- avg_scheduled_min,
- timeliness_trips_dir_0,
- timeliness_trips_dir_1,
- frequency,
- speed,
- vp_per_min,
- rt_vp_per_min,
- sched_vp_per_min,
- spatial_accuracy,
- text_dir0,
- text_dir1,
- ]
-
- chart = alt.vconcat(*chart_list).properties(
- resolve=alt.Resolve(
- scale=alt.LegendResolveMap(color=alt.ResolveMode("independent"))
- )
- )
- return chart
-
-### Section 1
-def summarize_monthly(df:pd.DataFrame)->pd.DataFrame:
- df2 = (
- df.groupby(
- ['name', 'month','time_of_day', 'day_name']
- )
- .agg(
- {
- "ttl_service_hours": "sum",
- }
- )
- .reset_index()
- )
-
- return df2
-
-def convert_to_timestamps(datetime_list):
- timestamps = []
- for dt in datetime_list:
- timestamp = dt.astype("datetime64[s]").astype(datetime)
- timestamps.append(timestamp)
- return timestamps
-
-def count_days_in_months(dates: list) -> pd.DataFrame:
- # Turn list from numpy datetime to timestamp
- dates2 = convert_to_timestamps(dates)
- # Initialize a dictionary to store counts for each day of the week
- day_counts = {}
-
- # Iterate over each date
- for date in dates2:
- year = date.year
- month = date.month
-
- # Initialize counts dictionary for the current month-year combination
- if (year, month) not in day_counts:
- day_counts[(year, month)] = {
- "Monday": 0,
- "Tuesday": 0,
- "Wednesday": 0,
- "Thursday": 0,
- "Friday": 0,
- "Saturday": 0,
- "Sunday": 0,
- }
-
- # Get the calendar matrix for the current month and year
- matrix = calendar.monthcalendar(year, month)
-
- # Iterate over each day in the matrix
- for week in matrix:
- for i, day in enumerate(week):
- # Increment the count for the corresponding day of the week
- if day != 0:
- weekday = calendar.day_name[i]
- day_counts[(year, month)][weekday] += 1
-
- # Convert the dictionary to a pandas DataFrame
- df = pd.DataFrame.from_dict(day_counts, orient="index")
- df = df.reset_index()
- df["level_1"] = df["level_1"].astype(str).str.zfill(2)
- df["month"] = df.level_0.astype(str) + "-" + df.level_1.astype(str)
- df = df.drop(columns=["level_0", "level_1"])
-
- # Melt from wide to long
- df2 = pd.melt(
- df,
- id_vars=["month"],
- value_vars=[
- "Monday",
- "Tuesday",
- "Wednesday",
- "Thursday",
- "Friday",
- "Saturday",
- "Sunday",
- "month",
- ])
-
- df2 = df2.rename(columns = {"variable":"day_name", "value":"n_days"})
- return df2
-
-def total_monthly_service(name:str) ->pd.DataFrame:
-
- df = load_scheduled_service(name)
-
- # Grab unique dates
- unique_dates = list(df.datetime_date.unique())
-
- # Find number of Monday's, Tuesday's...etc in each date
- month_days_df = count_days_in_months(unique_dates)
-
- # Aggregate the original dataframe
- agg_df = summarize_monthly(df)
-
- # Merge on number of day types
- agg_df = pd.merge(agg_df, month_days_df, on =["month", "day_name"], how = "left")
-
- # Find daily service hours
- agg_df["Daily Service Hours"] = agg_df.ttl_service_hours / agg_df.n_days
-
- # Rename columns
- agg_df.columns = agg_df.columns.map(_report_utils.replace_column_names)
-
- return agg_df
-
-def single_bar_chart_dropdown(
- df: pd.DataFrame,
- x_col: str,
- y_col: str,
- offset_col: str,
- title: str,
- dropdown_col: str,
- subtitle:str
-):
- dropdown_list = df[dropdown_col].unique().tolist()
- dropdown_list.sort(reverse=True)
- dropdown = alt.binding_select(options=dropdown_list, name=_report_utils.labeling(dropdown_col))
-
- selector = alt.selection_point(
- name=_report_utils.labeling(dropdown_col), fields=[dropdown_col], bind=dropdown
- )
-
- chart = (
- alt.Chart(df)
- .mark_bar()
- .encode(
- x=alt.X(
- f"{x_col}:N",
- title="Day",
- scale=alt.Scale(
- domain=[
- "Monday",
- "Tuesday",
- "Wednesday",
- "Thursday",
- "Friday",
- "Saturday",
- "Sunday",
- ]
- ),
- ),
- y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
- xOffset=f"{offset_col}:N",
- color=alt.Color(
- f"{offset_col}:N",
- title=_report_utils.labeling(offset_col),
- scale=alt.Scale(
- range=color_dict["full_color_scale"],
- ),
- ),
- tooltip=df.columns.tolist(),
- )
- )
- chart = chart.properties(
- title = {
- "text": [title],
- "subtitle": [subtitle],
- }, width=400, height=250)
- chart = chart.add_params(selector).transform_filter(selector)
-
- display(chart)
-
\ No newline at end of file
diff --git a/gtfs_digest/_section1_utils.py b/gtfs_digest/_section1_utils.py
index 371b5e4cc..193bba0a6 100644
--- a/gtfs_digest/_section1_utils.py
+++ b/gtfs_digest/_section1_utils.py
@@ -105,6 +105,19 @@ def load_operator_service_hours(name:str)->pd.DataFrame:
df.columns = df.columns.map(_report_utils.replace_column_names)
return df
+def load_operator_metrics(name:str)->pd.DataFrame:
+ """
+ Load dataframe with the total scheduled service hours
+ for a transit operator.
+ """
+ url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet"
+
+ df = pd.read_parquet(url,
+ filters=[[(("name", "==", name))]])
+
+ # Rename dataframe
+ df.columns = df.columns.map(_report_utils.replace_column_names)
+ return df
"""
Data Manipulation
Change dataframes from long to wide
diff --git a/gtfs_digest/_section2_utils.py b/gtfs_digest/_section2_utils.py
index 510b83beb..0ad947a35 100644
--- a/gtfs_digest/_section2_utils.py
+++ b/gtfs_digest/_section2_utils.py
@@ -67,6 +67,24 @@ def load_schedule_vp_metrics(organization:str)->pd.DataFrame:
return df
+def load_operator_metrics(organization_name:str)->pd.DataFrame:
+ """
+ Load the operator-level metrics table (digest_tables.operator_metrics)
+ for one organization and add the constant ruler columns.
+ """
+ url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_metrics}.parquet"
+
+ df = pd.read_parquet(url,
+ filters=[[(("organization_name", "==", organization_name))]])
+
+ # Rename dataframe
+ df.columns = df.columns.map(_report_utils.replace_column_names)
+
+ df["ruler_100_pct"] = 100
+
+ df["ruler_for_vp_per_min"] = 2
+ return df
+
"""
Data Manipulation
"""
@@ -199,41 +217,6 @@ def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
)
return df3
-def aggregate_by_agency(df: pd.DataFrame) -> pd.DataFrame:
- """
- Aggregate some of the metrics for all the routes
- across the agency.
- """
- # Filter to all day to avoid double counting
- df = df.loc[df["Period"] == "all_day"].reset_index(drop=True)
-
- # Aggregate by totals by date
- agg1 = (
- df.groupby(["Date"])
- .agg(
- {
- "# VP": "sum",
- "# VP within Scheduled Shape": "sum",
- "Aggregate Actual Service Minutes": "sum",
- "ruler_100_pct":"max",
- "ruler_for_vp_per_min":"max"
- }
- )
- .reset_index()
- )
-
- # Find metrics
- agg1["VP per Minute (All Routes)"] = (
- (agg1["# VP"] / agg1[ "Aggregate Actual Service Minutes"])
- ).round(2)
- agg1["Spatial Accuracy (All Routes)"] = ((
- agg1["# VP within Scheduled Shape"] / agg1["# VP"]
- ) * 100).round(2)
-
- # Sort the data
- agg1 = agg1.sort_values(by=["Date"]).reset_index(drop=True)
-
- return agg1
"""
Charts
"""
@@ -769,11 +752,9 @@ def simple_bar_chart(
Agency Metrics Overview Section
"""
def agency_overview(df:pd.DataFrame)->alt.Chart:
- agg1 = aggregate_by_agency(df)
- # display(agg1.head())
agency_spatial_chart = (
simple_bar_chart(
- agg1,
+ df,
"Spatial Accuracy (All Routes)",
"ruler_100_pct",
readable_dict["agency_spatial_accuracy"]["title"],
@@ -786,7 +767,7 @@ def agency_overview(df:pd.DataFrame)->alt.Chart:
agency_vp_chart = (
(
simple_bar_chart(
- agg1,
+ df,
"VP per Minute (All Routes)",
"ruler_for_vp_per_min",
readable_dict["agency_vp_per_min_graph"]["title"],
diff --git a/gtfs_digest/merge_operator_data.py b/gtfs_digest/merge_operator_data.py
index 3a489cd52..cd4cc271b 100644
--- a/gtfs_digest/merge_operator_data.py
+++ b/gtfs_digest/merge_operator_data.py
@@ -101,6 +101,24 @@ def operator_category_counts_by_date() -> pd.DataFrame:
return operator_category_counts
+def concatenate_operator_level_metrics(
+ date_list: list
+) -> pd.DataFrame:
+ """
+ Get spatial accuracy and VP per Minute metrics on the
+ operator-service_date grain.
+ """
+ FILE = f"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_operator_metrics}"
+
+ df = time_series_utils.concatenate_datasets_across_dates(
+ RT_SCHED_GCS,
+ FILE,
+ date_list,
+ data_type = "df",
+ ).sort_values(sort_cols).reset_index(drop=True)
+
+ return df
+
if __name__ == "__main__":
@@ -110,11 +128,18 @@ def operator_category_counts_by_date() -> pd.DataFrame:
OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map
+ OPERATOR_METRICS = GTFS_DATA_DICT.digest_tables.operator_metrics
SCHED_RT_CATEGORY = GTFS_DATA_DICT.digest_tables.operator_sched_rt
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()
+ # Concat operator metrics.
+ operator_metrics = concatenate_operator_level_metrics(analysis_date_list)
+ operator_metrics.to_parquet(
+ f"{RT_SCHED_GCS}{OPERATOR_METRICS}.parquet"
+ )
+
# Concat operator profiles
df = concatenate_operator_stats(analysis_date_list)
@@ -192,4 +217,5 @@ def operator_category_counts_by_date() -> pd.DataFrame:
operator_category_counts.to_parquet(
f"{RT_SCHED_GCS}{SCHED_RT_CATEGORY}.parquet"
)
+
\ No newline at end of file
diff --git a/gtfs_digest/readable.yml b/gtfs_digest/readable.yml
index 991561a89..57e4f9569 100644
--- a/gtfs_digest/readable.yml
+++ b/gtfs_digest/readable.yml
@@ -56,6 +56,8 @@ operator_n_trips: "# Trips"
operator_route_length_miles: "Operator Service Miles"
organization_name: Organization
organization_source_record_id: "Organization ID"
+vp_per_min_agency: "VP per Minute (All Routes)"
+spatial_accuracy_agency: "Spatial Accuracy (All Routes)"
# Dates / time
time_period: Period
diff --git a/rt_scheduled_v_ran/11_agency_agg.ipynb b/rt_scheduled_v_ran/11_agency_agg.ipynb
index 124f42ae8..fa30bfb9a 100644
--- a/rt_scheduled_v_ran/11_agency_agg.ipynb
+++ b/rt_scheduled_v_ran/11_agency_agg.ipynb
@@ -55,33 +55,13 @@
"id": "baf7ddd3-fd43-458a-9479-71bc9f7935db",
"metadata": {},
"source": [
- "### Exploring"
+ "### Exploring\n",
+ "* Need to filter for only (\"sched_rt_category\", \"==\", \"schedule_and_vp\") to get the same results as the one on analysis.calitp.org."
]
},
{
"cell_type": "code",
"execution_count": 3,
- "id": "00b45e96-315f-4f74-af8c-74eb994057ab",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "RT_SCHED_GCS"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
"id": "78b859a7-7598-4719-a806-887b31a5daa9",
"metadata": {},
"outputs": [],
@@ -91,174 +71,119 @@
},
{
"cell_type": "code",
- "execution_count": 5,
- "id": "a666f731-821a-4d7f-adc6-36ab7ee1428c",
- "metadata": {},
- "outputs": [],
- "source": [
- "ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "2d7af748-ceae-4a95-88e6-e24eb788a253",
+ "execution_count": 4,
+ "id": "b9cdee67-aafe-4e6a-adb9-5210e49bd82f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'vp_route_dir/route_direction_metrics'"
+ "['2024-01-17',\n",
+ " '2024-02-14',\n",
+ " '2024-03-13',\n",
+ " '2024-04-17',\n",
+ " '2024-05-22',\n",
+ " '2024-06-12',\n",
+ " '2024-07-17',\n",
+ " '2024-08-14',\n",
+ " '2024-09-18']"
]
},
- "execution_count": 6,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "ROUTE_EXPORT"
+ "rt_dates.y2024_dates"
]
},
{
"cell_type": "code",
- "execution_count": 33,
- "id": "7d685df6-f33c-430b-a878-22f7ce894aa2",
+ "execution_count": 5,
+ "id": "00b45e96-315f-4f74-af8c-74eb994057ab",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'vp_agency/agency_metrics'"
+ "'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'"
]
},
- "execution_count": 33,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "GTFS_DATA_DICT.rt_vs_schedule_tables.vp_agency_metrics"
+ "RT_SCHED_GCS"
]
},
{
"cell_type": "code",
- "execution_count": 8,
- "id": "386cab22-a872-4c9b-8eb4-970adede9c90",
+ "execution_count": 6,
+ "id": "a666f731-821a-4d7f-adc6-36ab7ee1428c",
"metadata": {},
"outputs": [],
"source": [
- "analysis_date = rt_dates.DATES[\"apr2024\"]"
+ "ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics"
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "id": "d07f3469-8630-41e5-a85f-dfc6e8dd544d",
+ "execution_count": 7,
+ "id": "2d7af748-ceae-4a95-88e6-e24eb788a253",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "{'may2022': '2022-05-04',\n",
- " 'sep2022': '2022-09-14',\n",
- " 'sep2022a': '2022-09-21',\n",
- " 'oct2022': '2022-10-12',\n",
- " 'nov2022a': '2022-11-07',\n",
- " 'nov2022b': '2022-11-08',\n",
- " 'nov2022c': '2022-11-09',\n",
- " 'nov2022d': '2022-11-10',\n",
- " 'nov2022': '2022-11-16',\n",
- " 'mar2023': '2023-03-15',\n",
- " 'may2023': '2023-05-17',\n",
- " 'sep2023': '2023-09-13',\n",
- " 'oct2023a': '2023-10-09',\n",
- " 'oct2023b': '2023-10-10',\n",
- " 'oct2023': '2023-10-11',\n",
- " 'oct2023c': '2023-10-12',\n",
- " 'oct2023d': '2023-10-13',\n",
- " 'oct2023e': '2023-10-14',\n",
- " 'oct2023f': '2023-10-15',\n",
- " 'nov2023': '2023-11-15',\n",
- " 'mar2024': '2024-03-13',\n",
- " 'may2024': '2024-05-22',\n",
- " 'sep2024': '2024-09-18'}"
+ "'vp_route_dir/route_direction_metrics'"
]
},
- "execution_count": 9,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "{k: v for k, v in rt_dates.DATES.items() if (k[:3], k[3:]) >= ('mar', '2023')}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "bb39cfdf-43a3-46e4-b200-019eb08b2de3",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_parquet(f\"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "69d44f05-44b3-41f1-8d69-c5ddc7fb3dcd",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = df.loc[df[\"time_period\"] == \"all_day\"].reset_index(drop=True)"
+ "ROUTE_EXPORT"
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "9da30f17-f428-4dad-a889-e86b2ce605f8",
+ "execution_count": 8,
+ "id": "386cab22-a872-4c9b-8eb4-970adede9c90",
"metadata": {},
"outputs": [],
"source": [
- "groupby_cols = [\"caltrans_district\", \"organization_name\", \"schedule_gtfs_dataset_key\"]"
+ "analysis_date = \"2024-09-18\""
]
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "0d8b94cc-f6ee-4e91-b8c2-3f98048e81bf",
+ "execution_count": 9,
+ "id": "bb39cfdf-43a3-46e4-b200-019eb08b2de3",
"metadata": {},
"outputs": [],
"source": [
- "agg1 = (\n",
- " df.groupby(groupby_cols)\n",
- " .agg(\n",
- " {\n",
- " \"total_vp\": \"sum\",\n",
- " \"vp_in_shape\": \"sum\",\n",
- " \"total_rt_service_minutes\": \"sum\",\n",
- " }\n",
- " )\n",
- " .reset_index()\n",
- ")"
+ "df = pd.read_parquet(f\"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet\")"
]
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "45a50735-5253-4434-99d2-2feb28431bd4",
+ "execution_count": 10,
+ "id": "69d44f05-44b3-41f1-8d69-c5ddc7fb3dcd",
"metadata": {},
"outputs": [],
"source": [
- "agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(2)\n",
- "agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)"
+ "df = df.loc[df[\"time_period\"] == \"all_day\"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "c886f721-ae63-4b23-8100-40fafc3587d4",
+ "execution_count": 11,
+ "id": "48a80998-a13b-4580-9778-ed67958a8e78",
"metadata": {},
"outputs": [
{
@@ -282,266 +207,237 @@
" \n",
" \n",
" | \n",
- " 3 | \n",
+ " 2275 | \n",
"
\n",
" \n",
" \n",
" \n",
- " caltrans_district | \n",
- " 01 - Eureka | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " cc53a0dbf5df90e3009b9cb5d89d80ba | \n",
"
\n",
" \n",
- " organization_name | \n",
- " Redwood Coast Transit Authority | \n",
+ " route_id | \n",
+ " 4867 | \n",
"
\n",
" \n",
- " schedule_gtfs_dataset_key | \n",
- " 090b30e4249a7ec2b4c6a0923ed2f953 | \n",
+ " direction_id | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " time_period | \n",
+ " all_day | \n",
+ "
\n",
+ " \n",
+ " minutes_atleast1_vp | \n",
+ " 1629 | \n",
+ "
\n",
+ " \n",
+ " minutes_atleast2_vp | \n",
+ " 1578 | \n",
+ "
\n",
+ " \n",
+ " total_rt_service_minutes | \n",
+ " 1627.50 | \n",
+ "
\n",
+ " \n",
+ " total_scheduled_service_minutes | \n",
+ " 1221.00 | \n",
"
\n",
" \n",
" total_vp | \n",
- " 7047 | \n",
+ " 4613 | \n",
"
\n",
" \n",
" vp_in_shape | \n",
- " 4746 | \n",
+ " 3491 | \n",
"
\n",
" \n",
- " total_rt_service_minutes | \n",
- " 2480.40 | \n",
+ " is_early | \n",
+ " 1 | \n",
"
\n",
" \n",
- " vp_per_min_agency | \n",
- " 2.84 | \n",
+ " is_ontime | \n",
+ " 5 | \n",
"
\n",
" \n",
- " spatial_accuracy_agency | \n",
- " 67.35 | \n",
+ " is_late | \n",
+ " 31 | \n",
"
\n",
- " \n",
- "\n",
- ""
- ],
- "text/plain": [
- " 3\n",
- "caltrans_district 01 - Eureka\n",
- "organization_name Redwood Coast Transit Authority\n",
- "schedule_gtfs_dataset_key 090b30e4249a7ec2b4c6a0923ed2f953\n",
- "total_vp 7047\n",
- "vp_in_shape 4746\n",
- "total_rt_service_minutes 2480.40\n",
- "vp_per_min_agency 2.84\n",
- "spatial_accuracy_agency 67.35"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "agg1.sample().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "3985f03a-bcd5-41f9-bcff-a5f3a1436603",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " caltrans_district | \n",
+ "
\n",
+ " n_vp_trips | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " vp_per_minute | \n",
+ " 2.83 | \n",
+ "
\n",
+ " \n",
+ " pct_in_shape | \n",
+ " 0.76 | \n",
+ "
\n",
+ " \n",
+ " pct_rt_journey_atleast1_vp | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " pct_rt_journey_atleast2_vp | \n",
+ " 0.97 | \n",
+ "
\n",
+ " \n",
+ " pct_sched_journey_atleast1_vp | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " pct_sched_journey_atleast2_vp | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " rt_sched_journey_ratio | \n",
+ " 1.33 | \n",
+ "
\n",
+ " \n",
+ " avg_rt_service_minutes | \n",
+ " 43.99 | \n",
+ "
\n",
+ " \n",
+ " name | \n",
+ " LA DOT Schedule | \n",
+ "
\n",
+ " \n",
+ " schedule_source_record_id | \n",
+ " rec4C3jVlVMVmxiNr | \n",
+ "
\n",
+ " \n",
+ " base64_url | \n",
+ " aHR0cHM6Ly9sYWRvdGJ1cy5jb20vZ3Rmcw== | \n",
+ "
\n",
+ " \n",
+ " organization_source_record_id | \n",
+ " rec4pgjrmdhCh4z01 | \n",
+ "
\n",
+ " \n",
" organization_name | \n",
- " schedule_gtfs_dataset_key | \n",
- " total_vp | \n",
- " vp_in_shape | \n",
- " total_rt_service_minutes | \n",
- " vp_per_min_agency | \n",
- " spatial_accuracy_agency | \n",
+ " City of Los Angeles | \n",
"
\n",
- " \n",
- " \n",
" \n",
- " 0 | \n",
- " 01 - Eureka | \n",
- " City of Eureka | \n",
- " a253a8d7acd57657bb98050f37dd6b0f | \n",
- " 37981 | \n",
- " 18000 | \n",
- " 13102.61 | \n",
- " 2.90 | \n",
- " 47.39 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 01 - Eureka | \n",
- " Lake Transit Authority | \n",
- " 0a3c0b21c85fb09f8db91599e14dd7f7 | \n",
- " 13320 | \n",
- " 12772 | \n",
- " 5433.32 | \n",
- " 2.45 | \n",
- " 95.89 | \n",
+ " caltrans_district | \n",
+ " 07 - Los Angeles | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
- " caltrans_district organization_name schedule_gtfs_dataset_key \\\n",
- "0 01 - Eureka City of Eureka a253a8d7acd57657bb98050f37dd6b0f \n",
- "1 01 - Eureka Lake Transit Authority 0a3c0b21c85fb09f8db91599e14dd7f7 \n",
- "\n",
- " total_vp vp_in_shape total_rt_service_minutes vp_per_min_agency \\\n",
- "0 37981 18000 13102.61 2.90 \n",
- "1 13320 12772 5433.32 2.45 \n",
- "\n",
- " spatial_accuracy_agency \n",
- "0 47.39 \n",
- "1 95.89 "
+ " 2275\n",
+ "schedule_gtfs_dataset_key cc53a0dbf5df90e3009b9cb5d89d80ba\n",
+ "route_id 4867\n",
+ "direction_id 0.00\n",
+ "time_period all_day\n",
+ "minutes_atleast1_vp 1629\n",
+ "minutes_atleast2_vp 1578\n",
+ "total_rt_service_minutes 1627.50\n",
+ "total_scheduled_service_minutes 1221.00\n",
+ "total_vp 4613\n",
+ "vp_in_shape 3491\n",
+ "is_early 1\n",
+ "is_ontime 5\n",
+ "is_late 31\n",
+ "n_vp_trips 37\n",
+ "vp_per_minute 2.83\n",
+ "pct_in_shape 0.76\n",
+ "pct_rt_journey_atleast1_vp 1.00\n",
+ "pct_rt_journey_atleast2_vp 0.97\n",
+ "pct_sched_journey_atleast1_vp 1.00\n",
+ "pct_sched_journey_atleast2_vp 1.00\n",
+ "rt_sched_journey_ratio 1.33\n",
+ "avg_rt_service_minutes 43.99\n",
+ "name LA DOT Schedule\n",
+ "schedule_source_record_id rec4C3jVlVMVmxiNr\n",
+ "base64_url aHR0cHM6Ly9sYWRvdGJ1cy5jb20vZ3Rmcw==\n",
+ "organization_source_record_id rec4pgjrmdhCh4z01\n",
+ "organization_name City of Los Angeles\n",
+ "caltrans_district 07 - Los Angeles"
]
},
- "execution_count": 16,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "agg1.head(2)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b1d4b72a-e09d-41f0-bc13-c78e65bad8b0",
- "metadata": {},
- "source": [
- "### Functions "
+ "df.sample().T"
]
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "02a30975-d9d5-4174-8a5f-47c1e80970df",
- "metadata": {},
- "outputs": [],
- "source": [
- "def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:\n",
- " # start = datetime.datetime.now()\n",
- "\n",
- " ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics\n",
- " AGENCY_EXPORT = dict_inputs.vp_agency_metrics\n",
- "\n",
- " # Read in dataframe.\n",
- " df = pd.read_parquet(f\"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet\")\n",
- "\n",
- " # Keep only all_day.\n",
- " df = df.loc[df[\"time_period\"] == \"all_day\"].reset_index(drop=True)\n",
- "\n",
- " # Aggregate\n",
- " groupby_cols = [\n",
- " \"caltrans_district\",\n",
- " \"organization_name\",\n",
- " \"schedule_gtfs_dataset_key\",\n",
- " ]\n",
- "\n",
- " sum_cols = [\"total_vp\", \"vp_in_shape\", \"total_rt_service_minutes\"]\n",
- " agg1 = df.groupby(groupby_cols).agg({**{e: \"sum\" for e in sum_cols}}).reset_index()\n",
- "\n",
- " agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(\n",
- " 2\n",
- " )\n",
- " agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(\n",
- " 2\n",
- " )\n",
- "\n",
- " agg1 = agg1.drop(columns=sum_cols)\n",
- " # Save\n",
- " agg1.to_parquet(f\"{RT_SCHED_GCS}{AGENCY_EXPORT}_TEST_{analysis_date}.parquet\")\n",
- "\n",
- " # end = datetime.datetime.now()\n",
- " # logger.info(f\"agency aggregation {analysis_date}: {end - start}\")\n",
- "\n",
- " return agg1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "7696d284-c8cb-4739-8131-dc873933994e",
+ "execution_count": 12,
+ "id": "9da30f17-f428-4dad-a889-e86b2ce605f8",
"metadata": {},
"outputs": [],
"source": [
- "analysis_date2 = rt_dates.DATES[\"apr2024\"]"
+ "groupby_cols = [\n",
+ " \"caltrans_district\",\n",
+ " \"organization_name\",\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ "]"
]
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "815a5ad4-2422-44ed-86c2-2bd5c4eae693",
+ "execution_count": 13,
+ "id": "0d8b94cc-f6ee-4e91-b8c2-3f98048e81bf",
"metadata": {},
"outputs": [],
"source": [
- "dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables"
+ "agg1 = (\n",
+ " df.groupby(groupby_cols)\n",
+ " .agg(\n",
+ " {\n",
+ " \"total_vp\": \"sum\",\n",
+ " \"vp_in_shape\": \"sum\",\n",
+ " \"total_rt_service_minutes\": \"sum\",\n",
+ " }\n",
+ " )\n",
+ " .reset_index()\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 20,
- "id": "85a4d3ea-cf58-44e7-901f-59414416e092",
+ "execution_count": 14,
+ "id": "45a50735-5253-4434-99d2-2feb28431bd4",
"metadata": {},
"outputs": [],
"source": [
- "apr_df = agency_metrics(\n",
- " analysis_date2,\n",
- " dict_inputs,\n",
- ")"
+ "agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(2)\n",
+ "agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)"
]
},
{
"cell_type": "code",
- "execution_count": 21,
- "id": "0b62649f-15b6-459d-ab1d-b2627521abfe",
+ "execution_count": 15,
+ "id": "e7ff37d4-bb02-4ca1-af61-a71a01565322",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.8153321332404478"
+ "0.6517265362899927"
]
},
- "execution_count": 21,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "703396 / 862711"
+ "63718 / 97768"
]
},
{
"cell_type": "code",
- "execution_count": 22,
- "id": "82ce25f9-3088-4d10-8427-f0fdf4f8c05f",
+ "execution_count": 16,
+ "id": "c886f721-ae63-4b23-8100-40fafc3587d4",
"metadata": {},
"outputs": [
{
@@ -565,200 +461,139 @@
" \n",
" \n",
" | \n",
- " 12 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
" caltrans_district | \n",
- " 04 - Oakland | \n",
- "
\n",
- " \n",
" organization_name | \n",
- " Alameda-Contra Costa Transit District | \n",
- "
\n",
- " \n",
" schedule_gtfs_dataset_key | \n",
- " c499f905e33929a641f083dad55c521e | \n",
- "
\n",
- " \n",
+ " total_vp | \n",
+ " vp_in_shape | \n",
+ " total_rt_service_minutes | \n",
" vp_per_min_agency | \n",
- " 2.02 | \n",
- "
\n",
- " \n",
" spatial_accuracy_agency | \n",
- " 81.53 | \n",
- "
\n",
- " \n",
- "\n",
- ""
- ],
- "text/plain": [
- " 12\n",
- "caltrans_district 04 - Oakland\n",
- "organization_name Alameda-Contra Costa Transit District\n",
- "schedule_gtfs_dataset_key c499f905e33929a641f083dad55c521e\n",
- "vp_per_min_agency 2.02\n",
- "spatial_accuracy_agency 81.53"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "apr_df.loc[apr_df.organization_name == \"Alameda-Contra Costa Transit District\"].T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "d006206a-48e0-49b4-ba68-cd279fd7f0dc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 42 | \n",
"
\n",
" \n",
" \n",
" \n",
- " caltrans_district | \n",
- " 05 - San Luis Obispo | \n",
- "
\n",
- " \n",
- " organization_name | \n",
- " Santa Cruz Metropolitan Transit District | \n",
- "
\n",
- " \n",
- " schedule_gtfs_dataset_key | \n",
- " 43d8d305ee692724a532f30ea63a1cbe | \n",
- "
\n",
- " \n",
- " vp_per_min_agency | \n",
- " 1.52 | \n",
- "
\n",
- " \n",
- " spatial_accuracy_agency | \n",
- " 94.49 | \n",
+ " 23 | \n",
+ " 04 - Oakland | \n",
+ " Marin County Transit District | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 97768 | \n",
+ " 63718 | \n",
+ " 36831.13 | \n",
+ " 2.65 | \n",
+ " 65.17 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
- " 42\n",
- "caltrans_district 05 - San Luis Obispo\n",
- "organization_name Santa Cruz Metropolitan Transit District\n",
- "schedule_gtfs_dataset_key 43d8d305ee692724a532f30ea63a1cbe\n",
- "vp_per_min_agency 1.52\n",
- "spatial_accuracy_agency 94.49"
+ " caltrans_district organization_name \\\n",
+ "23 04 - Oakland Marin County Transit District \n",
+ "\n",
+ " schedule_gtfs_dataset_key total_vp vp_in_shape \\\n",
+ "23 015d67d5b75b5cf2b710bbadadfb75f5 97768 63718 \n",
+ "\n",
+ " total_rt_service_minutes vp_per_min_agency spatial_accuracy_agency \n",
+ "23 36831.13 2.65 65.17 "
]
},
- "execution_count": 23,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "apr_df.sample().T"
+ "agg1.loc[agg1.organization_name == \"Marin County Transit District\"]"
]
},
{
"cell_type": "markdown",
- "id": "b3e76f4c-c933-490f-89bf-01369797e5b0",
+ "id": "f5682445-84eb-485d-bb70-6651fc576ead",
"metadata": {},
"source": [
- "### Look at the files"
+ "### Original"
]
},
{
"cell_type": "code",
- "execution_count": 24,
- "id": "2038f4d6-1d3e-4331-b08f-aa2812a6b749",
+ "execution_count": 17,
+ "id": "f1cf75d6-5fc0-45c5-a223-6bbe86b79992",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "RT_SCHED_GCS"
+ "organization_name = \"Marin County Transit District\""
]
},
{
"cell_type": "code",
- "execution_count": 25,
- "id": "cf1aab5e-f375-44a2-bddb-78830f29f762",
+ "execution_count": 18,
+ "id": "7997ef5b-b7b9-4ef8-9dd0-a6a40ea28ba6",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'vp_agency/agency_metrics'"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "dict_inputs.vp_agency_metrics"
+ "schd_vp_url = f\"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet\"\n",
+ "\n",
+ "# Read every row here; the schedule_and_vp filter is applied in a later cell\n",
+ "vp_sched_df = pd.read_parquet(schd_vp_url)"
]
},
{
"cell_type": "code",
- "execution_count": 26,
- "id": "1fea8b05-c690-49c8-82c9-39c3f2f17e98",
+ "execution_count": 19,
+ "id": "c597a945-a110-472e-b0d6-7bec6ef91370",
"metadata": {},
"outputs": [],
"source": [
- "sept_df = pd.read_parquet(\"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_agency/agency_metrics_TEST_2024-09-18.parquet\")"
+ "vp_sched_df = vp_sched_df.loc[vp_sched_df[\"time_period\"] == \"all_day\"].reset_index(\n",
+ " drop=True\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 29,
- "id": "1699a6a0-223d-4667-b3b4-b7650387cb7f",
+ "execution_count": 20,
+ "id": "5f5a8bb7-ef34-46cf-9930-afa9ea2c664e",
"metadata": {},
"outputs": [],
"source": [
- "mar_df = pd.read_parquet(\"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_agency/agency_metrics_TEST_2024-03-13.parquet\")"
+ "schedule_and_vp_only = vp_sched_df.loc[vp_sched_df.sched_rt_category == \"schedule_and_vp\"]"
]
},
{
"cell_type": "code",
- "execution_count": 34,
- "id": "085868d8-ee1a-4849-b13a-27c90ac9f8ac",
+ "execution_count": 21,
+ "id": "7cb49efb-784a-482f-a739-f024ca3eb91f",
"metadata": {},
+ "outputs": [],
+ "source": [
+ "vp_sched_df2 = (\n",
+ " vp_sched_df.groupby(\n",
+ " [\n",
+ " \"caltrans_district\",\n",
+ " \"organization_name\",\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"service_date\",\n",
+ " ]\n",
+ " )\n",
+ " .agg(\n",
+ " {\n",
+ " \"total_vp\": \"sum\",\n",
+ " \"total_rt_service_minutes\": \"sum\",\n",
+ " \"vp_in_shape\": \"sum\",\n",
+ " }\n",
+ " )\n",
+ " .reset_index()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "c0538e01-a104-4f5b-aed0-8c023d23e060",
+ "metadata": {
+ "tags": []
+ },
"outputs": [
{
"data": {
@@ -784,127 +619,85 @@
" caltrans_district | \n",
" organization_name | \n",
" schedule_gtfs_dataset_key | \n",
- " vp_per_min_agency | \n",
- " spatial_accuracy_agency | \n",
+ " service_date | \n",
+ " total_vp | \n",
+ " total_rt_service_minutes | \n",
+ " vp_in_shape | \n",
" \n",
" \n",
" \n",
" \n",
- " 0 | \n",
- " 01 - Eureka | \n",
- " City of Eureka | \n",
- " a253a8d7acd57657bb98050f37dd6b0f | \n",
- " 2.90 | \n",
- " 96.56 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 01 - Eureka | \n",
- " Lake Transit Authority | \n",
- " 0a3c0b21c85fb09f8db91599e14dd7f7 | \n",
- " 2.44 | \n",
- " 96.45 | \n",
+ " 956 | \n",
+ " 04 - Oakland | \n",
+ " Marin County Transit District | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 2024-09-18 | \n",
+ " 97768 | \n",
+ " 36831.13 | \n",
+ " 63718 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " caltrans_district organization_name schedule_gtfs_dataset_key \\\n",
- "0 01 - Eureka City of Eureka a253a8d7acd57657bb98050f37dd6b0f \n",
- "1 01 - Eureka Lake Transit Authority 0a3c0b21c85fb09f8db91599e14dd7f7 \n",
+ " caltrans_district organization_name \\\n",
+ "956 04 - Oakland Marin County Transit District \n",
"\n",
- " vp_per_min_agency spatial_accuracy_agency \n",
- "0 2.90 96.56 \n",
- "1 2.44 96.45 "
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "mar_df.head(2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "ea934f21-c2e4-42dd-be5a-32b740008ba2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'vp_agency/agency_metrics_TEST_'"
+ " schedule_gtfs_dataset_key service_date total_vp \\\n",
+ "956 015d67d5b75b5cf2b710bbadadfb75f5 2024-09-18 97768 \n",
+ "\n",
+ " total_rt_service_minutes vp_in_shape \n",
+ "956 36831.13 63718 "
]
},
- "execution_count": 38,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "f\"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_agency_metrics}_TEST_\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "f4881744-f5dc-4a7a-9203-88ff661741ca",
- "metadata": {},
- "outputs": [],
- "source": [
- "sort_cols = [\"schedule_gtfs_dataset_key\", \"service_date\"]"
+ "vp_sched_df2.loc[\n",
+ " (vp_sched_df2.organization_name == organization_name)\n",
+ " & (vp_sched_df2.service_date == \"2024-09-18\")\n",
+ "]"
]
},
{
"cell_type": "code",
- "execution_count": 43,
- "id": "f76206d8-23b8-462f-8e61-9f839b12eeb7",
- "metadata": {},
- "outputs": [],
- "source": [
- "def concatenate_agency_level_metrics(\n",
- " date_list: list\n",
- ") -> pd.DataFrame:\n",
- " FILE = f\"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_agency_metrics}_TEST\"\n",
- " \n",
- " df = time_series_utils.concatenate_datasets_across_dates(\n",
- " RT_SCHED_GCS,\n",
- " FILE,\n",
- " date_list,\n",
- " data_type = \"df\",\n",
- " ).sort_values(sort_cols).reset_index(drop=True)\n",
- " \n",
- " return df\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "id": "db75c693-2bd5-4631-9fe9-b0e302b89abf",
+ "execution_count": 23,
+ "id": "ac36b3db-447b-4c21-abe3-0cbb3f35ec68",
"metadata": {},
"outputs": [],
"source": [
- "analysis_date_list = rt_dates.y2024_dates"
+ "vp_sched_df3 = (\n",
+ " schedule_and_vp_only.groupby([\"caltrans_district\", \"organization_name\", \"service_date\"])\n",
+ " .agg(\n",
+ " {\n",
+ " \"total_vp\": \"sum\",\n",
+ " \"total_rt_service_minutes\": \"sum\",\n",
+ " \"vp_in_shape\": \"sum\",\n",
+ " }\n",
+ " )\n",
+ " .reset_index()\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 45,
- "id": "39f4dcc0-ead6-46e7-8e5a-4eebf5d03544",
+ "execution_count": 24,
+ "id": "dd41cecb-69f1-4ea6-90c8-6a6118b63c41",
"metadata": {},
"outputs": [],
"source": [
- "final_df = concatenate_agency_level_metrics(analysis_date_list)"
+ "vp_sched_df3[\"vp_per_min_agency\"] = ((vp_sched_df3.total_vp / vp_sched_df3.total_rt_service_minutes)).round(2)\n",
+ "vp_sched_df3[\"spatial_accuracy_agency\"] = ((vp_sched_df3.vp_in_shape / vp_sched_df3.total_vp) * 100).round(2)"
]
},
{
"cell_type": "code",
- "execution_count": 46,
- "id": "5ade9cb8-893b-4e27-8151-9abc31ea60c2",
+ "execution_count": 25,
+ "id": "e32a1f49-5483-466c-a208-1db84a2d0e70",
"metadata": {},
"outputs": [
{
@@ -928,58 +721,89 @@
" \n",
" \n",
" | \n",
- " caltrans_district | \n",
- " organization_name | \n",
- " schedule_gtfs_dataset_key | \n",
- " vp_per_min_agency | \n",
- " spatial_accuracy_agency | \n",
- " service_date | \n",
+ " 477 | \n",
"
\n",
" \n",
" \n",
" \n",
- " 0 | \n",
+ " caltrans_district | \n",
" 04 - Oakland | \n",
- " Marin County Transit District | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 2.60 | \n",
- " 90.88 | \n",
- " 2024-01-17 | \n",
"
\n",
" \n",
- " 1 | \n",
- " 04 - Oakland | \n",
+ " organization_name | \n",
" Marin County Transit District | \n",
- " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
- " 2.68 | \n",
- " 90.43 | \n",
- " 2024-02-14 | \n",
+ "
\n",
+ " \n",
+ " service_date | \n",
+ " 2024-09-18 00:00:00 | \n",
+ "
\n",
+ " \n",
+ " total_vp | \n",
+ " 67420 | \n",
+ "
\n",
+ " \n",
+ " total_rt_service_minutes | \n",
+ " 25282.30 | \n",
+ "
\n",
+ " \n",
+ " vp_in_shape | \n",
+ " 61736 | \n",
+ "
\n",
+ " \n",
+ " vp_per_min_agency | \n",
+ " 2.67 | \n",
+ "
\n",
+ " \n",
+ " spatial_accuracy_agency | \n",
+ " 91.57 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " caltrans_district organization_name \\\n",
- "0 04 - Oakland Marin County Transit District \n",
- "1 04 - Oakland Marin County Transit District \n",
- "\n",
- " schedule_gtfs_dataset_key vp_per_min_agency \\\n",
- "0 015d67d5b75b5cf2b710bbadadfb75f5 2.60 \n",
- "1 015d67d5b75b5cf2b710bbadadfb75f5 2.68 \n",
- "\n",
- " spatial_accuracy_agency service_date \n",
- "0 90.88 2024-01-17 \n",
- "1 90.43 2024-02-14 "
+ " 477\n",
+ "caltrans_district 04 - Oakland\n",
+ "organization_name Marin County Transit District\n",
+ "service_date 2024-09-18 00:00:00\n",
+ "total_vp 67420\n",
+ "total_rt_service_minutes 25282.30\n",
+ "vp_in_shape 61736\n",
+ "vp_per_min_agency 2.67\n",
+ "spatial_accuracy_agency 91.57"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "vp_sched_df3.loc[\n",
+ " (vp_sched_df3.organization_name == organization_name)\n",
+ " & (vp_sched_df3.service_date == \"2024-09-18\")\n",
+ "].T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "08442ae2-cb81-493c-8ed8-839b70c26780",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.6517265362899927"
]
},
- "execution_count": 46,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "final_df.head(2)"
+ "63718 / 97768"
]
}
],
diff --git a/rt_scheduled_v_ran/12_agency_agg2.ipynb b/rt_scheduled_v_ran/12_agency_agg2.ipynb
new file mode 100644
index 000000000..0e03a0da2
--- /dev/null
+++ b/rt_scheduled_v_ran/12_agency_agg2.ipynb
@@ -0,0 +1,634 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "257a475b-714f-4000-9f2e-0376a3951acd",
+ "metadata": {},
+ "source": [
+ "## Agency Grain Metrics\n",
+ "* Starting from `vp_trips` this time. \n",
+ "* Add it to the pipeline in `rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py`\n",
+ "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "343294c0-3ae5-440a-92fc-43d20408b701",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import geopandas as gpd\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from segment_speed_utils import (\n",
+ " gtfs_schedule_wrangling,\n",
+ " helpers,\n",
+ " metrics,\n",
+ " time_series_utils,\n",
+ ")\n",
+ "from segment_speed_utils.project_vars import (\n",
+ " COMPILED_CACHED_VIEWS,\n",
+ " GTFS_DATA_DICT,\n",
+ " PROJECT_CRS,\n",
+ " RT_SCHED_GCS,\n",
+ " SCHED_GCS,\n",
+ " SEGMENT_GCS,\n",
+ ")\n",
+ "from shared_utils import catalog_utils, rt_dates, rt_utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "04beb077-3365-4290-a933-12a0ce750e53",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.options.display.max_columns = 100\n",
+ "pd.options.display.float_format = \"{:.2f}\".format\n",
+ "pd.set_option(\"display.max_rows\", None)\n",
+ "pd.set_option(\"display.max_colwidth\", None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "dbfba5e1-ad13-46bb-9921-74762549d9e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "ee546f20-6c03-459c-ac6f-61738f66a895",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'dir': '${gcs_paths.RT_SCHED_GCS}', 'stop_times_direction': 'stop_times_direction', 'sched_trip_metrics': 'schedule_trip/schedule_trip_metrics', 'sched_route_direction_metrics': 'schedule_route_dir/schedule_route_direction_metrics', 'vp_trip_metrics': 'vp_trip/trip_metrics', 'vp_route_direction_metrics': 'vp_route_dir/route_direction_metrics', 'vp_operator_metrics': 'vp_operator/operator_metrics', 'schedule_rt_stop_times': 'schedule_rt_stop_times', 'early_trip_minutes': -5, 'late_trip_minutes': 5}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dict_inputs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "0b3cfc74-2a10-434c-84ae-d723ad6396d0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['2024-01-17',\n",
+ " '2024-02-14',\n",
+ " '2024-03-13',\n",
+ " '2024-04-17',\n",
+ " '2024-05-22',\n",
+ " '2024-06-12',\n",
+ " '2024-07-17',\n",
+ " '2024-08-14',\n",
+ " '2024-09-18']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rt_dates.y2024_dates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8e7e816e-6a65-46a0-941d-e4231289e203",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TRIP_EXPORT = dict_inputs.vp_trip_metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c3bf9359-318b-406c-b668-278c128e5292",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "crosswalk_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"name\",\n",
+ " \"organization_name\",\n",
+ " \"caltrans_district\",]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "2a51515c-37d0-4c0a-8cb4-e265d1434755",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "analysis_date = \"2024-09-18\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "8d2be580-0e06-43a6-8c34-ce8fd06ea2e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_parquet(f\"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "c4c2e26b-9878-4566-98d8-1ff71237617e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = gtfs_schedule_wrangling.merge_operator_identifiers(\n",
+ " df,\n",
+ " [analysis_date],\n",
+ " columns = crosswalk_cols)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76a66216-0acc-4f89-b940-aa8d89bf476c",
+ "metadata": {},
+ "source": [
+ "### What time of day do I use?\n",
+ "`df.loc[df[\"time_period\"] == \"all_day\"]` is not available: the trip-level table has no `time_period` column, only `time_of_day` and `peak_offpeak`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "3eef5b7d-5b36-459c-b1d8-b2f953ab54e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['AM Peak', 'Evening', 'PM Peak', 'Early AM', 'Midday', 'Owl'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.time_of_day.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "fdea269f-50d9-47b4-ba4c-eaae7c5fd06c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['peak', 'offpeak'], dtype=object)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.peak_offpeak.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "8e00f7f5-49c7-4a29-90b0-fe924a40c01d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " trip_instance_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " scheduled_service_minutes | \n",
+ " total_vp | \n",
+ " rt_service_minutes | \n",
+ " minutes_atleast1_vp | \n",
+ " minutes_atleast2_vp | \n",
+ " vp_in_shape | \n",
+ " sched_rt_category | \n",
+ " time_of_day | \n",
+ " peak_offpeak | \n",
+ " vp_per_minute | \n",
+ " pct_in_shape | \n",
+ " pct_rt_journey_atleast1_vp | \n",
+ " pct_rt_journey_atleast2_vp | \n",
+ " pct_sched_journey_atleast1_vp | \n",
+ " pct_sched_journey_atleast2_vp | \n",
+ " rt_sched_journey_difference | \n",
+ " is_early | \n",
+ " is_ontime | \n",
+ " is_late | \n",
+ " name | \n",
+ " organization_name | \n",
+ " caltrans_district | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7cc0cb1871dfd558f11a2885c145d144 | \n",
+ " 000213c9d5753f9565b679d8ab84929f | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 29.00 | \n",
+ " 200 | \n",
+ " 66.25 | \n",
+ " 68 | \n",
+ " 66 | \n",
+ " 195 | \n",
+ " schedule_and_vp | \n",
+ " AM Peak | \n",
+ " peak | \n",
+ " 3.02 | \n",
+ " 0.97 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 37.25 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Bay Area 511 Muni Schedule | \n",
+ " City and County of San Francisco | \n",
+ " 04 - Oakland | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key trip_instance_key \\\n",
+ "0 7cc0cb1871dfd558f11a2885c145d144 000213c9d5753f9565b679d8ab84929f \n",
+ "\n",
+ " route_id direction_id scheduled_service_minutes total_vp \\\n",
+ "0 2 1.00 29.00 200 \n",
+ "\n",
+ " rt_service_minutes minutes_atleast1_vp minutes_atleast2_vp vp_in_shape \\\n",
+ "0 66.25 68 66 195 \n",
+ "\n",
+ " sched_rt_category time_of_day peak_offpeak vp_per_minute pct_in_shape \\\n",
+ "0 schedule_and_vp AM Peak peak 3.02 0.97 \n",
+ "\n",
+ " pct_rt_journey_atleast1_vp pct_rt_journey_atleast2_vp \\\n",
+ "0 1.00 1.00 \n",
+ "\n",
+ " pct_sched_journey_atleast1_vp pct_sched_journey_atleast2_vp \\\n",
+ "0 1.00 1.00 \n",
+ "\n",
+ " rt_sched_journey_difference is_early is_ontime is_late \\\n",
+ "0 37.25 0 0 1 \n",
+ "\n",
+ " name organization_name \\\n",
+ "0 Bay Area 511 Muni Schedule City and County of San Francisco \n",
+ "\n",
+ " caltrans_district \n",
+ "0 04 - Oakland "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "7a575da4-2efc-44f4-9c56-9d3277d806a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "groupby_cols = [\n",
+ " \"caltrans_district\",\n",
+ " \"organization_name\",\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "42c7e971-23e7-40b9-9386-47c5b1b47b9d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agg1 = (\n",
+ " df2.groupby(groupby_cols)\n",
+ " .agg(\n",
+ " {\n",
+ " \"total_vp\": \"sum\",\n",
+ " \"vp_in_shape\": \"sum\",\n",
+ " \"rt_service_minutes\": \"sum\",\n",
+ " }\n",
+ " )\n",
+ " .reset_index()\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "a738e9fd-143a-497c-b7af-a0884e7346d7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "agg1[\"vp_per_min_agency\"] = ((agg1.total_vp / agg1.rt_service_minutes)).round(2)\n",
+ "agg1[\"spatial_accuracy_agency\"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "c0c61998-c937-432e-bcf3-4850eb300c60",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " caltrans_district | \n",
+ " organization_name | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " total_vp | \n",
+ " vp_in_shape | \n",
+ " rt_service_minutes | \n",
+ " vp_per_min_agency | \n",
+ " spatial_accuracy_agency | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 01 - Eureka | \n",
+ " City of Arcata | \n",
+ " a253a8d7acd57657bb98050f37dd6b0f | \n",
+ " 38142 | \n",
+ " 17935 | \n",
+ " 13367.50 | \n",
+ " 2.85 | \n",
+ " 47.02 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 01 - Eureka | \n",
+ " City of Eureka | \n",
+ " a253a8d7acd57657bb98050f37dd6b0f | \n",
+ " 38142 | \n",
+ " 17935 | \n",
+ " 13367.50 | \n",
+ " 2.85 | \n",
+ " 47.02 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 01 - Eureka | \n",
+ " Humboldt Transit Authority | \n",
+ " a253a8d7acd57657bb98050f37dd6b0f | \n",
+ " 38142 | \n",
+ " 17935 | \n",
+ " 13367.50 | \n",
+ " 2.85 | \n",
+ " 47.02 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 01 - Eureka | \n",
+ " Lake Transit Authority | \n",
+ " 0a3c0b21c85fb09f8db91599e14dd7f7 | \n",
+ " 11572 | \n",
+ " 11223 | \n",
+ " 5015.75 | \n",
+ " 2.31 | \n",
+ " 96.98 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 01 - Eureka | \n",
+ " Mendocino Transit Authority | \n",
+ " 770072d7a8d356b529ef34fe01715bcb | \n",
+ " 16196 | \n",
+ " 13702 | \n",
+ " 6699.27 | \n",
+ " 2.42 | \n",
+ " 84.60 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " caltrans_district organization_name \\\n",
+ "0 01 - Eureka City of Arcata \n",
+ "1 01 - Eureka City of Eureka \n",
+ "2 01 - Eureka Humboldt Transit Authority \n",
+ "3 01 - Eureka Lake Transit Authority \n",
+ "4 01 - Eureka Mendocino Transit Authority \n",
+ "\n",
+ " schedule_gtfs_dataset_key total_vp vp_in_shape \\\n",
+ "0 a253a8d7acd57657bb98050f37dd6b0f 38142 17935 \n",
+ "1 a253a8d7acd57657bb98050f37dd6b0f 38142 17935 \n",
+ "2 a253a8d7acd57657bb98050f37dd6b0f 38142 17935 \n",
+ "3 0a3c0b21c85fb09f8db91599e14dd7f7 11572 11223 \n",
+ "4 770072d7a8d356b529ef34fe01715bcb 16196 13702 \n",
+ "\n",
+ " rt_service_minutes vp_per_min_agency spatial_accuracy_agency \n",
+ "0 13367.50 2.85 47.02 \n",
+ "1 13367.50 2.85 47.02 \n",
+ "2 13367.50 2.85 47.02 \n",
+ "3 5015.75 2.31 96.98 \n",
+ "4 6699.27 2.42 84.60 "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "agg1.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "ee228e7c-6479-4b9f-be77-367f50ea46e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " caltrans_district | \n",
+ " 04 - Oakland | \n",
+ "
\n",
+ " \n",
+ " organization_name | \n",
+ " Marin County Transit District | \n",
+ "
\n",
+ " \n",
+ " schedule_gtfs_dataset_key | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ "
\n",
+ " \n",
+ " total_vp | \n",
+ " 97768 | \n",
+ "
\n",
+ " \n",
+ " vp_in_shape | \n",
+ " 63718 | \n",
+ "
\n",
+ " \n",
+ " rt_service_minutes | \n",
+ " 36831.13 | \n",
+ "
\n",
+ " \n",
+ " vp_per_min_agency | \n",
+ " 2.65 | \n",
+ "
\n",
+ " \n",
+ " spatial_accuracy_agency | \n",
+ " 65.17 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 28\n",
+ "caltrans_district 04 - Oakland\n",
+ "organization_name Marin County Transit District\n",
+ "schedule_gtfs_dataset_key 015d67d5b75b5cf2b710bbadadfb75f5\n",
+ "total_vp 97768\n",
+ "vp_in_shape 63718\n",
+ "rt_service_minutes 36831.13\n",
+ "vp_per_min_agency 2.65\n",
+ "spatial_accuracy_agency 65.17"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "agg1.loc[agg1.organization_name == \"Marin County Transit District\"].T"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py b/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py
index 0e6a2555e..25a0d2488 100644
--- a/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py
+++ b/rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py
@@ -12,18 +12,27 @@
from update_vars import RT_SCHED_GCS, GTFS_DATA_DICT
from shared_utils import rt_dates
-def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:
+def operator_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:
start = datetime.datetime.now()
- ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics
- AGENCY_EXPORT = dict_inputs.vp_agency_metrics
+ TRIP_EXPORT = dict_inputs.vp_trip_metrics
+ OP_EXPORT = dict_inputs.vp_operator_metrics
# Read in dataframe.
- df = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet")
-
- # Keep only all_day.
- df = df.loc[df["time_period"] == "all_day"].reset_index(drop=True)
-
+ df = pd.read_parquet(f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet")
+
+ # Merge in identifiers
+ crosswalk_cols = [
+ "schedule_gtfs_dataset_key",
+ "name",
+ "organization_name",
+ "caltrans_district",]
+
+ df2 = gtfs_schedule_wrangling.merge_operator_identifiers(
+ df,
+ [analysis_date],
+ columns = crosswalk_cols)
+
# Aggregate
groupby_cols = [
"caltrans_district",
@@ -31,21 +40,17 @@ def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:
"schedule_gtfs_dataset_key",
]
- sum_cols = ["total_vp", "vp_in_shape", "total_rt_service_minutes"]
- agg1 = df.groupby(groupby_cols).agg({**{e: "sum" for e in sum_cols}}).reset_index()
+ sum_cols = ["total_vp", "vp_in_shape", "rt_service_minutes"]
+ agg1 = df2.groupby(groupby_cols).agg({**{e: "sum" for e in sum_cols}}).reset_index()
- agg1["vp_per_min_agency"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(
- 2
- )
- agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(
- 2
- )
+ agg1["vp_per_min_agency"] = ((agg1.total_vp / agg1.rt_service_minutes)).round(2)
+ agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)
- # Cleanrt_V
+ # Clean
agg1 = agg1.drop(columns=sum_cols)
- # Save: take out test later
- agg1.to_parquet(f"{RT_SCHED_GCS}{AGENCY_EXPORT}_TEST_{analysis_date}.parquet")
+ # Save
+ agg1.to_parquet(f"{RT_SCHED_GCS}{OP_EXPORT}_{analysis_date}.parquet")
end = datetime.datetime.now()
logger.info(f"agency aggregation {analysis_date}: {end - start}")
@@ -54,7 +59,7 @@ def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:
if __name__ == "__main__":
- LOG_FILE = "../logs/rt_v_scheduled_agency_metrics.log"
+ LOG_FILE = "../logs/rt_v_scheduled_operator_metrics.log"
logger.add(LOG_FILE, retention="3 months")
logger.add(sys.stderr,
format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}",
@@ -65,4 +70,4 @@ def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables
for analysis_date in analysis_date_list:
- agency_metrics(analysis_date, dict_inputs)
\ No newline at end of file
+ operator_metrics(analysis_date, dict_inputs)
\ No newline at end of file
diff --git a/rt_scheduled_v_ran/scripts/update_vars.py b/rt_scheduled_v_ran/scripts/update_vars.py
index 455bbd82b..1f8daf291 100644
--- a/rt_scheduled_v_ran/scripts/update_vars.py
+++ b/rt_scheduled_v_ran/scripts/update_vars.py
@@ -6,7 +6,7 @@
apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True)
# analysis_date_list = [rt_dates.DATES["sep2024"]]
-analysis_date_list = rt_dates.y2024_dates
+analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")