diff --git a/gtfs_digest/37_transit_bunching_samples.ipynb b/gtfs_digest/37_transit_bunching_samples.ipynb
index ce4ab095b..a882c0b66 100644
--- a/gtfs_digest/37_transit_bunching_samples.ipynb
+++ b/gtfs_digest/37_transit_bunching_samples.ipynb
@@ -41142,7 +41142,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/tmp/ipykernel_2446/3770378478.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n",
+ "/tmp/ipykernel_2867/3770378478.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n",
" trips_routes_times2.converted_schd_arrival.describe()\n"
]
},
@@ -41463,7 +41463,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/tmp/ipykernel_2446/452450045.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n",
+ "/tmp/ipykernel_2867/452450045.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n",
" trips_routes_times2.converted_rt_arrival.describe()\n"
]
},
@@ -41839,14 +41839,14 @@
" \n",
"
\n",
" \n",
- " 7746 | \n",
+ " 10317 | \n",
" 0666caf3ec1ecc96b74f4477ee4bc939 | \n",
" 33-13172 | \n",
- " 6935 | \n",
- " 20 | \n",
- " 2024-05-22 06:26:29 | \n",
+ " 4690 | \n",
+ " 21 | \n",
+ " 2024-05-22 00:24:04 | \n",
" NaN | \n",
- " 2024-05-22 06:25:00 | \n",
+ " 2024-05-22 00:25:00 | \n",
" NaN | \n",
"
\n",
" \n",
@@ -41854,14 +41854,14 @@
""
],
"text/plain": [
- " schedule_gtfs_dataset_key route_id stop_id stop_sequence \\\n",
- "7746 0666caf3ec1ecc96b74f4477ee4bc939 33-13172 6935 20 \n",
+ " schedule_gtfs_dataset_key route_id stop_id stop_sequence \\\n",
+ "10317 0666caf3ec1ecc96b74f4477ee4bc939 33-13172 4690 21 \n",
"\n",
- " converted_rt_arrival actual_arrival_lag_min converted_schd_arrival \\\n",
- "7746 2024-05-22 06:26:29 NaN 2024-05-22 06:25:00 \n",
+ " converted_rt_arrival actual_arrival_lag_min converted_schd_arrival \\\n",
+ "10317 2024-05-22 00:24:04 NaN 2024-05-22 00:25:00 \n",
"\n",
- " scheduled_arrival_lag_min \n",
- "7746 NaN "
+ " scheduled_arrival_lag_min \n",
+ "10317 NaN "
]
},
"execution_count": 73,
@@ -50608,23 +50608,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "test1 = compare_approaches(\n",
+ " stop_id=\"5685\",\n",
+ " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n",
+ " route_id=\"204-13172\",\n",
+ " stop_sequence=46,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bddf4717-b901-4ee0-a9a1-46df6f9a04e3",
+ "metadata": {},
+ "source": [
+ "### Something going wrong for `all_trips` once I aggregate for all operators."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "8af00aab-c5cd-48c7-babb-1d45f2ecdc8b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " caltrans_district | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " feed_key | \n",
+ " organization_name | \n",
+ " route_long_name | \n",
+ " route_type | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " stop_id | \n",
+ " stop_sequence | \n",
+ " all_trips | \n",
+ " per_trip_bunched_per_stop | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 47535 | \n",
+ " 07 - Los Angeles | \n",
+ " 0666caf3ec1ecc96b74f4477ee4bc939 | \n",
+ " 608992664173210532aa3e6cc573be2f | \n",
+ " Los Angeles County Metropolitan Transportation Authority | \n",
+ " Metro Local Line | \n",
+ " Bus | \n",
+ " 33-13172 | \n",
+ " 1.00 | \n",
+ " 3104 | \n",
+ " 80 | \n",
+ " 30.00 | \n",
+ " 0.10 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " caltrans_district schedule_gtfs_dataset_key \\\n",
+ "47535 07 - Los Angeles 0666caf3ec1ecc96b74f4477ee4bc939 \n",
+ "\n",
+ " feed_key \\\n",
+ "47535 608992664173210532aa3e6cc573be2f \n",
+ "\n",
+ " organization_name \\\n",
+ "47535 Los Angeles County Metropolitan Transportation Authority \n",
+ "\n",
+ " route_long_name route_type route_id direction_id stop_id \\\n",
+ "47535 Metro Local Line Bus 33-13172 1.00 3104 \n",
+ "\n",
+ " stop_sequence all_trips per_trip_bunched_per_stop \n",
+ "47535 80 30.00 0.10 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " caltrans_district | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " feed_key | \n",
+ " organization_name | \n",
+ " route_long_name | \n",
+ " route_type | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " stop_id | \n",
+ " stop_sequence | \n",
+ " all_trips | \n",
+ " per_trip_bunched_per_stop | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 55965 | \n",
+ " 07 - Los Angeles | \n",
+ " 0666caf3ec1ecc96b74f4477ee4bc939 | \n",
+ " 608992664173210532aa3e6cc573be2f | \n",
+ " Los Angeles County Metropolitan Transportation Authority | \n",
+ " Metro Local Line | \n",
+ " Bus | \n",
+ " 33-13172 | \n",
+ " 1.00 | \n",
+ " 3104 | \n",
+ " 80 | \n",
+ " 53.00 | \n",
+ " 0.51 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " caltrans_district schedule_gtfs_dataset_key \\\n",
+ "55965 07 - Los Angeles 0666caf3ec1ecc96b74f4477ee4bc939 \n",
+ "\n",
+ " feed_key \\\n",
+ "55965 608992664173210532aa3e6cc573be2f \n",
+ "\n",
+ " organization_name \\\n",
+ "55965 Los Angeles County Metropolitan Transportation Authority \n",
+ "\n",
+ " route_long_name route_type route_id direction_id stop_id \\\n",
+ "55965 Metro Local Line Bus 33-13172 1.00 3104 \n",
+ "\n",
+ " stop_sequence all_trips per_trip_bunched_per_stop \n",
+ "55965 80 53.00 0.51 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "27"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "test3 = compare_approaches(\n",
+ " stop_id=\"3104\",\n",
+ " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n",
+ " route_id=\"33-13172\",\n",
+ " stop_sequence=80,\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gtfs_digest/43_transit_bunching.ipynb b/gtfs_digest/43_transit_bunching.ipynb
new file mode 100644
index 000000000..2884d59d4
--- /dev/null
+++ b/gtfs_digest/43_transit_bunching.ipynb
@@ -0,0 +1,2540 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "733e5c07-e894-48de-b92a-9cba10b7fc9a",
+ "metadata": {},
+ "source": [
+ "## Transit Bunching \n",
+ "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`\n",
+ "* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)\n",
+ "### 11/8\n",
+ "* Figure out how to address City of Visalia: one of the buses that is scheduled to arrive earlier arrives later than another bus. \n",
+ "* This leads to a negative time stamp and makes it appear like there is a lot of bunching per the Transit Matters approach.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "60097cf1-857d-4c7f-9fc7-043a69ec1a61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datetime as dt\n",
+ "\n",
+ "import altair as alt\n",
+ "import geopandas as gpd\n",
+ "import merge_data\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils\n",
+ "from shared_utils import catalog_utils, rt_dates, rt_utils\n",
+ "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS\n",
+ "\n",
+ "# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml\n",
+ "GTFS_DATA_DICT = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n",
+ "\n",
+ "from segment_speed_utils.project_vars import (\n",
+ " COMPILED_CACHED_VIEWS,\n",
+ " GTFS_DATA_DICT,\n",
+ " PROJECT_CRS,\n",
+ " RT_SCHED_GCS,\n",
+ " SCHED_GCS,\n",
+ " SEGMENT_GCS,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "51a085f8-5981-4da7-904f-6348cf2e18b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.options.display.max_columns = 100\n",
+ "pd.options.display.float_format = \"{:.2f}\".format\n",
+ "pd.set_option(\"display.max_rows\", None)\n",
+ "pd.set_option(\"display.max_colwidth\", None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "113bd786-8d4b-4153-939b-b419b4fa97ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "may_date = \"2024-05-22\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "44a80420-dbe8-4a2b-9c47-44b32bf28e00",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "drop_for_preview = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"trip_instance_key\",\n",
+ " \"shape_array_key\",\n",
+ " \"feed_key\",\n",
+ " \"trip_id\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2be7e7cc-b6c5-4c87-b2d5-6cb8612e23ce",
+ "metadata": {},
+ "source": [
+ "### Grab Sample Routes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "873bf893-40a2-4b64-84ba-68604c6df18b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "subset = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"route_primary_direction\",\n",
+ " \"service_date\",\n",
+ " \"frequency\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "c9c35b69-d5ad-47b6-9027-33cc6631835e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'schedule_route_dir/schedule_route_direction_metrics'"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "85ad8696-1b70-4996-8866-3cd2ad3d3738",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_dir_columns = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"time_period\",\n",
+ " \"route_primary_direction\",\n",
+ " \"frequency\",\n",
+ " \"service_date\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "9d07da3c-e347-4671-925b-e8f5bc31c0fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[\n",
+ " route_dir_columns\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "260b6305-9e99-4870-82f7-5cae925d9a0d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " time_period | \n",
+ " route_primary_direction | \n",
+ " frequency | \n",
+ " service_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " Northbound | \n",
+ " 0.92 | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " Northbound | \n",
+ " 0.62 | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " Northbound | \n",
+ " 1.50 | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 1.00 | \n",
+ " all_day | \n",
+ " Southbound | \n",
+ " 0.92 | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 1.00 | \n",
+ " offpeak | \n",
+ " Southbound | \n",
+ " 0.69 | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id time_period \\\n",
+ "0 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
+ "1 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 offpeak \n",
+ "2 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 peak \n",
+ "3 015d67d5b75b5cf2b710bbadadfb75f5 17 1.00 all_day \n",
+ "4 015d67d5b75b5cf2b710bbadadfb75f5 17 1.00 offpeak \n",
+ "\n",
+ " route_primary_direction frequency service_date \n",
+ "0 Northbound 0.92 2024-05-22 \n",
+ "1 Northbound 0.62 2024-05-22 \n",
+ "2 Northbound 1.50 2024-05-22 \n",
+ "3 Southbound 0.92 2024-05-22 \n",
+ "4 Southbound 0.69 2024-05-22 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "route_dir.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "84741559-46a2-4a62-a6e2-8843771aea1f",
+ "metadata": {},
+ "source": [
+ "#### Attach operators and districts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "d35238bb-2418-466b-8814-96382abeb3eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Grab Crosswalk\n",
+ "CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "6272b8aa-0d76-4e18-bc76-66548f54c9a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "crosswalk_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"organization_name\",\n",
+ " \"name\",\n",
+ " \"caltrans_district\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "d5d62df4-cffb-4bc8-9ad1-9766a8ec2bf1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "crosswalk_df = (\n",
+ " time_series_utils.concatenate_datasets_across_dates(\n",
+ " SCHED_GCS, CROSSWALK, [may_date], data_type=\"df\", columns=crosswalk_cols\n",
+ " )\n",
+ " .sort_values([\"service_date\"])\n",
+ " .reset_index(drop=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "4666bf1d-6456-49d6-955a-5ab77556af15",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " organization_name | \n",
+ " name | \n",
+ " caltrans_district | \n",
+ " service_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1770249a5a2e770ca90628434d4934b1 | \n",
+ " Ventura County Transportation Commission | \n",
+ " VCTC GMV Schedule | \n",
+ " 07 - Los Angeles | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " f8102a9c0693206bf36d302540bf1bcf | \n",
+ " City of Corona | \n",
+ " Corona Schedule | \n",
+ " 08 - San Bernardino | \n",
+ " 2024-05-22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key organization_name \\\n",
+ "0 1770249a5a2e770ca90628434d4934b1 Ventura County Transportation Commission \n",
+ "1 f8102a9c0693206bf36d302540bf1bcf City of Corona \n",
+ "\n",
+ " name caltrans_district service_date \n",
+ "0 VCTC GMV Schedule 07 - Los Angeles 2024-05-22 \n",
+ "1 Corona Schedule 08 - San Bernardino 2024-05-22 "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "crosswalk_df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "661b3b18-5735-443d-acfc-4f598882a661",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(189, 5)"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "crosswalk_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "d9d562b4-7ce1-4ea5-bdba-e8e0abbc3815",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "routes = pd.merge(\n",
+ " route_dir,\n",
+ " crosswalk_df,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"service_date\"],\n",
+ " how=\"left\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "3e80b935-ecb4-44cd-9174-72c0d4568e14",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# routes = pd.concat([thousand_oaks, visalia, metro, metro_33])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "6f1b4c16-841d-43a1-89d6-fb4bcd394786",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['17', '219', '228', ..., '10867636', '10867637', '11096761'],\n",
+ " dtype=object)"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "routes.route_id.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "796640fb-35e0-42d1-8cd0-cc10c31befcf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " route_id | \n",
+ " direction_id | \n",
+ " time_period | \n",
+ " route_primary_direction | \n",
+ " frequency | \n",
+ " service_date | \n",
+ " organization_name | \n",
+ " name | \n",
+ " caltrans_district | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 0.00 | \n",
+ " all_day | \n",
+ " Northbound | \n",
+ " 0.92 | \n",
+ " 2024-05-22 | \n",
+ " Marin County Transit District | \n",
+ " Bay Area 511 Marin Schedule | \n",
+ " 04 - Oakland | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 0.00 | \n",
+ " offpeak | \n",
+ " Northbound | \n",
+ " 0.62 | \n",
+ " 2024-05-22 | \n",
+ " Marin County Transit District | \n",
+ " Bay Area 511 Marin Schedule | \n",
+ " 04 - Oakland | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 015d67d5b75b5cf2b710bbadadfb75f5 | \n",
+ " 17 | \n",
+ " 0.00 | \n",
+ " peak | \n",
+ " Northbound | \n",
+ " 1.50 | \n",
+ " 2024-05-22 | \n",
+ " Marin County Transit District | \n",
+ " Bay Area 511 Marin Schedule | \n",
+ " 04 - Oakland | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " schedule_gtfs_dataset_key route_id direction_id time_period \\\n",
+ "0 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n",
+ "1 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 offpeak \n",
+ "2 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 peak \n",
+ "\n",
+ " route_primary_direction frequency service_date \\\n",
+ "0 Northbound 0.92 2024-05-22 \n",
+ "1 Northbound 0.62 2024-05-22 \n",
+ "2 Northbound 1.50 2024-05-22 \n",
+ "\n",
+ " organization_name name \\\n",
+ "0 Marin County Transit District Bay Area 511 Marin Schedule \n",
+ "1 Marin County Transit District Bay Area 511 Marin Schedule \n",
+ "2 Marin County Transit District Bay Area 511 Marin Schedule \n",
+ "\n",
+ " caltrans_district \n",
+ "0 04 - Oakland \n",
+ "1 04 - Oakland \n",
+ "2 04 - Oakland "
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "routes.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ca3a3e0-561f-4588-b71d-80abe692215a",
+ "metadata": {},
+ "source": [
+ "### Add Trips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "674f8ee6-3f6d-4f90-90ba-7ea9ee688b28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TABLE = GTFS_DATA_DICT.schedule_downloads.trips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "5333afa8-4849-4864-b253-b03a1093e84f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FILE = f\"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "111e2c27-337d-4440-a1f6-10ec582a6f9e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_subset = [\n",
+ " \"gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"trip_instance_key\",\n",
+ " \"shape_array_key\",\n",
+ " \"feed_key\",\n",
+ " \"route_long_name\",\n",
+ " \"direction_id\",\n",
+ " \"route_type\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "e1aed68a-0ed2-4da4-904a-cb91681e7f38",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips = pd.read_parquet(FILE)[trips_subset].rename(\n",
+ " columns={\"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\"}\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "04901f96-76d5-4d99-b3d1-f174ef989357",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes = pd.merge(\n",
+ " trips,\n",
+ " routes,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"],\n",
+ " how=\"inner\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "1195177f-0215-4b24-9e47-dccfb3ee542e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(392497, 15)"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trips_routes.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "9c1ba23b-30df-4916-a522-eb70bd5afdb9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1338"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trips_routes.route_id.nunique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3fe48162-5b42-4bf9-8853-a9d742e9d03b",
+ "metadata": {},
+ "source": [
+ "#### I know we can get this from the warehouse but it seems cumbersome. Correct me if I'm wrong."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "d9688e03-4b61-4736-b9d5-3539b0de80b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://gtfs.org/documentation/schedule/reference/#\n",
+ "route_type_crosswalk = {\n",
+ " \"route_type\": [\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"11\", \"12\"],\n",
+ " \"route_type_str\": [\n",
+ " \"Tram, Streetcar, Light rail\",\n",
+ " \"Subway, Metro\",\n",
+ " \"Rail\",\n",
+ " \"Bus\",\n",
+ " \"Ferry.\",\n",
+ " \"Cable tram.\",\n",
+ " \"Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).\",\n",
+ " \"Funicular.\",\n",
+ " \"Trolleybus.\",\n",
+ " \"Monorail.\",\n",
+ " ],\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "8dac05a6-0ba9-472b-85a2-5a0081550efb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "cd13aa3e-f222-49f8-b923-1e9e901f7bfb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge for route_type\n",
+ "trips_routes = pd.merge(\n",
+ " trips_routes, route_type_crosswalk_df, on=[\"route_type\"], how=\"left\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "ae72b7fc-ec7b-4dcd-8553-ac2abce5da1d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes = trips_routes.drop(columns=[\"route_type\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "cd31791a-fcb7-4731-90d7-bf5606dd4ce3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes = trips_routes.rename(columns={\"route_type_str\": \"route_type\"})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e3be8778-84ff-479f-a3b2-178e374da5f2",
+ "metadata": {},
+ "source": [
+ "### Get Stop Times "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "3f6727c6-205f-4cc8-8a68-42e8dec6e4b3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the schedule + RT stop_times for the same service date as the rest of the notebook\n",
+ "# (`may_date`), instead of repeating the date inside the path.\n",
+ "rt_stop_times = pd.read_parquet(\n",
+ "    f\"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_{may_date}.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "041731b3-0fcd-4b3e-8d01-f84460dd5fab",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " trip_id | \n",
+ " stop_id | \n",
+ " stop_sequence | \n",
+ " scheduled_arrival_sec | \n",
+ " schedule_gtfs_dataset_key | \n",
+ " trip_instance_key | \n",
+ " rt_arrival_sec | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1d105244-776c-4b3f-af78-9c7ad78c2103 | \n",
+ " 0b2443b6-b50f-452b-a749-464588ca93b8 | \n",
+ " 8 | \n",
+ " 60991.00 | \n",
+ " 1fd2f07342d966919b15d5d37fda8cc8 | \n",
+ " 45ae17540ca9fb5030c84dbb12e48e9a | \n",
+ " 61434 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1d105244-776c-4b3f-af78-9c7ad78c2103 | \n",
+ " cd5650b0-9a18-4e78-aedc-385f3094fa0f | \n",
+ " 9 | \n",
+ " 61179.00 | \n",
+ " 1fd2f07342d966919b15d5d37fda8cc8 | \n",
+ " 45ae17540ca9fb5030c84dbb12e48e9a | \n",
+ " 61616 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " trip_id stop_id \\\n",
+ "0 1d105244-776c-4b3f-af78-9c7ad78c2103 0b2443b6-b50f-452b-a749-464588ca93b8 \n",
+ "1 1d105244-776c-4b3f-af78-9c7ad78c2103 cd5650b0-9a18-4e78-aedc-385f3094fa0f \n",
+ "\n",
+ " stop_sequence scheduled_arrival_sec schedule_gtfs_dataset_key \\\n",
+ "0 8 60991.00 1fd2f07342d966919b15d5d37fda8cc8 \n",
+ "1 9 61179.00 1fd2f07342d966919b15d5d37fda8cc8 \n",
+ "\n",
+ " trip_instance_key rt_arrival_sec \n",
+ "0 45ae17540ca9fb5030c84dbb12e48e9a 61434 \n",
+ "1 45ae17540ca9fb5030c84dbb12e48e9a 61616 "
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rt_stop_times.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "049a2833-f132-431a-8f44-92f31cd11d8a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times = pd.merge(\n",
+ " rt_stop_times,\n",
+ " trips_routes,\n",
+ " on=[\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"trip_instance_key\",\n",
+ " ],\n",
+ " how=\"inner\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "7c59d770-d379-422e-a23d-9140c23df375",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "44988"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(trips_routes_times.scheduled_arrival_sec.isna().sum())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "4b2df823-ec98-483d-b30f-9072c62704b6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Int64Index: 11031910 entries, 0 to 11031909\n",
+ "Data columns (total 20 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 trip_id object \n",
+ " 1 stop_id object \n",
+ " 2 stop_sequence int64 \n",
+ " 3 scheduled_arrival_sec float64 \n",
+ " 4 schedule_gtfs_dataset_key object \n",
+ " 5 trip_instance_key object \n",
+ " 6 rt_arrival_sec int64 \n",
+ " 7 route_id object \n",
+ " 8 shape_array_key object \n",
+ " 9 feed_key object \n",
+ " 10 route_long_name object \n",
+ " 11 direction_id float64 \n",
+ " 12 time_period object \n",
+ " 13 route_primary_direction object \n",
+ " 14 frequency float64 \n",
+ " 15 service_date datetime64[ns]\n",
+ " 16 organization_name object \n",
+ " 17 name object \n",
+ " 18 caltrans_district object \n",
+ " 19 route_type object \n",
+ "dtypes: datetime64[ns](1), float64(3), int64(2), object(14)\n",
+ "memory usage: 1.7+ GB\n"
+ ]
+ }
+ ],
+ "source": [
+ "trips_routes_times.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2d09574e-464e-4f31-8f53-0596911dcabe",
+ "metadata": {},
+ "source": [
+ "### Sorting "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "0ef36e93-79ed-4f86-b16a-9d28d90aea1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2 = trips_routes_times.sort_values(\n",
+ " by=[\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_long_name\",\n",
+ " \"shape_array_key\",\n",
+ " \"direction_id\",\n",
+ " \"stop_sequence\",\n",
+ " \"stop_id\",\n",
+ " \"rt_arrival_sec\",\n",
+ " ]\n",
+ ").reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "976db21c-45da-4773-8285-fe4a33e14152",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview_sort_col = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"stop_sequence\",\n",
+ " \"rt_arrival_sec\",\n",
+ " \"stop_id\",\n",
+ " \"scheduled_arrival_sec\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "b0bbe80c-75bf-4d3e-b5e7-891791114291",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(21009, 7)"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trips_routes_times2.loc[\n",
+ " (\n",
+ " trips_routes_times2.schedule_gtfs_dataset_key\n",
+ " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n",
+ " )\n",
+ " & (trips_routes_times2.route_id == \"204-13172\")\n",
+ " & (trips_routes_times2.direction_id == 1)\n",
+ "][preview_sort_col].shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "48737ec7-31be-4743-97ca-c6c000670a13",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Convert scheduled and RT arrival times.\n",
+ "* If 82800 < `scheduled_arrival_time` < 86_400 but `rt_arrival_sec` is lower say 14_000 (4 am in the morning): then the bus was scheduled to arrive on May 21 (day before the service date) but it arrived a little later on the actual service date we query the data for. \n",
+ "* If 86_400 < `scheduled_arrival_time` and `rt_arrival_sec` is around 86_000 then this is the same service date. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "d4e5a2df-a110-4347-9c6d-7f9bdfdbce1b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 11031910.00\n",
+ "mean 48176.39\n",
+ "std 17806.20\n",
+ "min 0.00\n",
+ "25% 33526.00\n",
+ "50% 48304.00\n",
+ "75% 62400.00\n",
+ "max 86399.00\n",
+ "Name: rt_arrival_sec, dtype: float64"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trips_routes_times2[\"rt_arrival_sec\"].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "97e3dc65-45fc-4318-a5f6-b0f497a6ab04",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1024, 20)"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trips_routes_times2.loc[trips_routes_times2[\"scheduled_arrival_sec\"] == 86_400].shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "630e4f7c-73c5-4f90-9da8-bf50378f3996",
+ "metadata": {},
+ "source": [
+ "#### Filter out for trips with `scheduled_arrival_sec` that's over 24 hours."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "099c6838-aa42-4eb5-a250-0ac3e054f834",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "timestamp_subset = [\n",
+ " \"converted_schd_arrival\",\n",
+ " \"converted_rt_arrival\",\n",
+ " \"scheduled_arrival_sec\",\n",
+ " \"rt_arrival_sec\",\n",
+ " \"service_date\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "d64637d8-ca0d-4e3c-b956-13d8e64e513c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def adjust_sched_arrival_seconds(sched_arrival_seconds, date, rt_arrival_sec):\n",
+ " \"\"\"\n",
+ " Adjusts days and time of sched_arrival_seconds because it runs over 24 hours\n",
+ " based on a combination of sched_arrival_seconds and rt_arrival_sec criteria.\n",
+ "\n",
+ " Parameters:\n",
+ " sched_arrival_seconds (int): Number of seconds.\n",
+ " date (datetime): Initial date.\n",
+ " rt_arrival_sec (int): Arrival time in seconds.\n",
+ "\n",
+ " Returns:\n",
+ " datetime: Adjusted date and time for sched_arrival_seconds\n",
+ " \"\"\"\n",
+ " # If the real-time arrival is between 12am and 1am and the scheduled arrival time\n",
+ " # is between 11pm and 1am, subtract a day\n",
+ " if rt_arrival_sec < (60 * 60) and (82_800 < sched_arrival_seconds < 90_000):\n",
+ " return pd.Timestamp(date + pd.Timedelta(days=-1)) + pd.Timedelta(\n",
+ " seconds=sched_arrival_seconds % 86400\n",
+ " )\n",
+ " else:\n",
+ " # No change\n",
+ " return pd.Timestamp(date) + pd.Timedelta(seconds=sched_arrival_seconds)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "75bfaa72-2fbc-4b88-9751-10744cdfc193",
+ "metadata": {},
+ "source": [
+ "#### Subset to make the df smaller\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "2f0efe6f-81ac-4382-9f9f-5455736a3026",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "subset = [\n",
+ " \"stop_id\",\n",
+ " \"stop_sequence\",\n",
+ " \"scheduled_arrival_sec\",\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"trip_instance_key\",\n",
+ " \"rt_arrival_sec\",\n",
+ " \"route_id\",\n",
+ " \"shape_array_key\",\n",
+ " \"route_long_name\",\n",
+ " \"direction_id\",\n",
+ " \"organization_name\",\n",
+ " \"caltrans_district\",\n",
+ " \"service_date\",\n",
+ " \"route_type\",\n",
+ " \"feed_key\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "bea6dc67-8a1c-484a-b141-43f29af83df9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2 = trips_routes_times2[subset]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "id": "41f6aded-7f9c-4f5e-ba7a-2a52769a3e44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#trips_routes_times2[\"converted_schd_arrival2\"] = trips_routes_times2.apply(\n",
+ " # lambda row: adjust_sched_arrival_seconds(\n",
+ " # row[\"scheduled_arrival_sec\"], row[\"service_date\"], row[\"rt_arrival_sec\"]\n",
+ " # ),\n",
+ "# axis=1,\n",
+ "#)#"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "3ce579d1-0392-41a6-b535-9f4422a78216",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"converted_rt_arrival\"] = pd.to_datetime(\n",
+ " trips_routes_times2[\"service_date\"]\n",
+ ") + pd.to_timedelta(trips_routes_times2[\"rt_arrival_sec\"] % 86400, unit=\"s\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "1d3e4966-1f57-4117-9f73-506055a42d7b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"converted_schd_arrival\"] = pd.to_datetime(\n",
+ " trips_routes_times2[\"service_date\"]\n",
+ ") + pd.to_timedelta(trips_routes_times2[\"scheduled_arrival_sec\"] % 86400, unit=\"s\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b44fa6d1-edf4-42be-af6f-3bc777b091c1",
+ "metadata": {},
+ "source": [
+ "#### Check the results of the conversion\n",
+ "* Make sure they make sense.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "60d18d14-1f6b-4b97-a736-cb295b3a6382",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " scheduled_arrival_sec | \n",
+ " converted_schd_arrival | \n",
+ " rt_arrival_sec | \n",
+ " converted_rt_arrival | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1575402 | \n",
+ " 87180.00 | \n",
+ " 2024-05-22 00:13:00 | \n",
+ " 846 | \n",
+ " 2024-05-22 00:14:06 | \n",
+ "
\n",
+ " \n",
+ " 573074 | \n",
+ " 91800.00 | \n",
+ " 2024-05-22 01:30:00 | \n",
+ " 5396 | \n",
+ " 2024-05-22 01:29:56 | \n",
+ "
\n",
+ " \n",
+ " 4038881 | \n",
+ " 103973.00 | \n",
+ " 2024-05-22 04:52:53 | \n",
+ " 17625 | \n",
+ " 2024-05-22 04:53:45 | \n",
+ "
\n",
+ " \n",
+ " 1823671 | \n",
+ " 102300.00 | \n",
+ " 2024-05-22 04:25:00 | \n",
+ " 15783 | \n",
+ " 2024-05-22 04:23:03 | \n",
+ "
\n",
+ " \n",
+ " 4312349 | \n",
+ " 86691.00 | \n",
+ " 2024-05-22 00:04:51 | \n",
+ " 209 | \n",
+ " 2024-05-22 00:03:29 | \n",
+ "
\n",
+ " \n",
+ " 571749 | \n",
+ " 89160.00 | \n",
+ " 2024-05-22 00:46:00 | \n",
+ " 2689 | \n",
+ " 2024-05-22 00:44:49 | \n",
+ "
\n",
+ " \n",
+ " 4083191 | \n",
+ " 98040.00 | \n",
+ " 2024-05-22 03:14:00 | \n",
+ " 11699 | \n",
+ " 2024-05-22 03:14:59 | \n",
+ "
\n",
+ " \n",
+ " 4046767 | \n",
+ " 87111.00 | \n",
+ " 2024-05-22 00:11:51 | \n",
+ " 595 | \n",
+ " 2024-05-22 00:09:55 | \n",
+ "
\n",
+ " \n",
+ " 7397112 | \n",
+ " 86820.00 | \n",
+ " 2024-05-22 00:07:00 | \n",
+ " 397 | \n",
+ " 2024-05-22 00:06:37 | \n",
+ "
\n",
+ " \n",
+ " 772826 | \n",
+ " 89160.00 | \n",
+ " 2024-05-22 00:46:00 | \n",
+ " 4180 | \n",
+ " 2024-05-22 01:09:40 | \n",
+ "
\n",
+ " \n",
+ " 1891015 | \n",
+ " 86760.00 | \n",
+ " 2024-05-22 00:06:00 | \n",
+ " 910 | \n",
+ " 2024-05-22 00:15:10 | \n",
+ "
\n",
+ " \n",
+ " 370261 | \n",
+ " 96960.00 | \n",
+ " 2024-05-22 02:56:00 | \n",
+ " 10192 | \n",
+ " 2024-05-22 02:49:52 | \n",
+ "
\n",
+ " \n",
+ " 4829315 | \n",
+ " 87035.00 | \n",
+ " 2024-05-22 00:10:35 | \n",
+ " 634 | \n",
+ " 2024-05-22 00:10:34 | \n",
+ "
\n",
+ " \n",
+ " 1264788 | \n",
+ " 95340.00 | \n",
+ " 2024-05-22 02:29:00 | \n",
+ " 9033 | \n",
+ " 2024-05-22 02:30:33 | \n",
+ "
\n",
+ " \n",
+ " 4415882 | \n",
+ " 100280.00 | \n",
+ " 2024-05-22 03:51:20 | \n",
+ " 14138 | \n",
+ " 2024-05-22 03:55:38 | \n",
+ "
\n",
+ " \n",
+ " 3738705 | \n",
+ " 86938.00 | \n",
+ " 2024-05-22 00:08:58 | \n",
+ " 653 | \n",
+ " 2024-05-22 00:10:53 | \n",
+ "
\n",
+ " \n",
+ " 2998940 | \n",
+ " 89100.00 | \n",
+ " 2024-05-22 00:45:00 | \n",
+ " 3014 | \n",
+ " 2024-05-22 00:50:14 | \n",
+ "
\n",
+ " \n",
+ " 2076139 | \n",
+ " 98100.00 | \n",
+ " 2024-05-22 03:15:00 | \n",
+ " 11952 | \n",
+ " 2024-05-22 03:19:12 | \n",
+ "
\n",
+ " \n",
+ " 3944449 | \n",
+ " 96840.00 | \n",
+ " 2024-05-22 02:54:00 | \n",
+ " 10873 | \n",
+ " 2024-05-22 03:01:13 | \n",
+ "
\n",
+ " \n",
+ " 4082972 | \n",
+ " 106965.00 | \n",
+ " 2024-05-22 05:42:45 | \n",
+ " 20378 | \n",
+ " 2024-05-22 05:39:38 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " scheduled_arrival_sec converted_schd_arrival rt_arrival_sec \\\n",
+ "1575402 87180.00 2024-05-22 00:13:00 846 \n",
+ "573074 91800.00 2024-05-22 01:30:00 5396 \n",
+ "4038881 103973.00 2024-05-22 04:52:53 17625 \n",
+ "1823671 102300.00 2024-05-22 04:25:00 15783 \n",
+ "4312349 86691.00 2024-05-22 00:04:51 209 \n",
+ "571749 89160.00 2024-05-22 00:46:00 2689 \n",
+ "4083191 98040.00 2024-05-22 03:14:00 11699 \n",
+ "4046767 87111.00 2024-05-22 00:11:51 595 \n",
+ "7397112 86820.00 2024-05-22 00:07:00 397 \n",
+ "772826 89160.00 2024-05-22 00:46:00 4180 \n",
+ "1891015 86760.00 2024-05-22 00:06:00 910 \n",
+ "370261 96960.00 2024-05-22 02:56:00 10192 \n",
+ "4829315 87035.00 2024-05-22 00:10:35 634 \n",
+ "1264788 95340.00 2024-05-22 02:29:00 9033 \n",
+ "4415882 100280.00 2024-05-22 03:51:20 14138 \n",
+ "3738705 86938.00 2024-05-22 00:08:58 653 \n",
+ "2998940 89100.00 2024-05-22 00:45:00 3014 \n",
+ "2076139 98100.00 2024-05-22 03:15:00 11952 \n",
+ "3944449 96840.00 2024-05-22 02:54:00 10873 \n",
+ "4082972 106965.00 2024-05-22 05:42:45 20378 \n",
+ "\n",
+ " converted_rt_arrival \n",
+ "1575402 2024-05-22 00:14:06 \n",
+ "573074 2024-05-22 01:29:56 \n",
+ "4038881 2024-05-22 04:53:45 \n",
+ "1823671 2024-05-22 04:23:03 \n",
+ "4312349 2024-05-22 00:03:29 \n",
+ "571749 2024-05-22 00:44:49 \n",
+ "4083191 2024-05-22 03:14:59 \n",
+ "4046767 2024-05-22 00:09:55 \n",
+ "7397112 2024-05-22 00:06:37 \n",
+ "772826 2024-05-22 01:09:40 \n",
+ "1891015 2024-05-22 00:15:10 \n",
+ "370261 2024-05-22 02:49:52 \n",
+ "4829315 2024-05-22 00:10:34 \n",
+ "1264788 2024-05-22 02:30:33 \n",
+ "4415882 2024-05-22 03:55:38 \n",
+ "3738705 2024-05-22 00:10:53 \n",
+ "2998940 2024-05-22 00:50:14 \n",
+ "2076139 2024-05-22 03:19:12 \n",
+ "3944449 2024-05-22 03:01:13 \n",
+ "4082972 2024-05-22 05:39:38 "
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trips_routes_times2.loc[trips_routes_times2[\"scheduled_arrival_sec\"] > 86_400][\n",
+ " [\n",
+ " \"scheduled_arrival_sec\",\n",
+ " \"converted_schd_arrival\",\n",
+ " \"rt_arrival_sec\",\n",
+ " \"converted_rt_arrival\",\n",
+ " ]\n",
+ "].sample(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31b10b40-73bd-4b02-92b7-bf0cacaf67b9",
+ "metadata": {},
+ "source": [
+ "### Deal with delays\n",
+ "* Some very extreme values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8325eed2-412b-4202-9d57-252db2fd7e26",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 10986922.00\n",
+ "mean 1.90\n",
+ "std 35.79\n",
+ "min -1439.78\n",
+ "25% -0.23\n",
+ "50% 1.45\n",
+ "75% 4.05\n",
+ "max 1439.98\n",
+ "Name: delay_min, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "more_than_86400 = deal_with_23_hours(trips_routes_times2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "003abe45-5d55-4839-80f8-83e693214427",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13d8ff3a-1ed5-46bb-af2e-86eaa7616046",
+ "metadata": {},
+ "source": [
+ "### Help: can't fix everything... how should these edge cases be addressed? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f0e5d8a-5e26-4c75-b593-2cc9aa275a35",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.converted_schd_arrival.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fefa0cfc-79af-4538-b613-bf4669daecd0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\n",
+ " trips_routes_times2[\"converted_schd_arrival\"].dt.strftime(\"%Y-%m-%d\")\n",
+ " == \"2024-05-21\"\n",
+ "].head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1844a0a-26e0-473f-88a0-3c9bd6748fac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\n",
+ " trips_routes_times2[\"converted_schd_arrival\"].dt.strftime(\"%Y-%m-%d\")\n",
+ " == \"2024-05-23\"\n",
+ "].head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "84a4772d-653b-49dd-8db2-33e67f3c0708",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.converted_rt_arrival.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "039bd4bb-999d-405b-85a3-8b6e2f41c454",
+ "metadata": {},
+ "source": [
+ "#### Question: Last time, I received the suggestion to drop rows with delays of more than 2 hours because those are not bunching. But wouldn't that make the metric misleading, since we want to calculate it over all the rows?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a16335dc-47a0-4ea1-9612-fc6d5dfea1d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(trips_routes_times2.delay_min.describe(percentiles))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "076d1774-5ddb-43e9-99f8-729deec0357c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# trips_routes_times2 = trips_routes_times2.loc[\n",
+ "# trips_routes_times2.delay_min < 120\n",
+ "# ].reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4029214f-47c8-4a4e-917b-dd5e70bbae25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# trips_routes_times2 = trips_routes_times2.loc[\n",
+ "# trips_routes_times2.delay_min > -120\n",
+ "# ].reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81bd753a-08ee-4d09-ac79-213e1e605405",
+ "metadata": {},
+ "source": [
+ "### Calculate the actual & scheduled headway at the `operator-route-direction_id-stop_id-stop_sequence` grain\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9171520-7358-4f22-9d9c-c5156e710f1b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "groupby_cols = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_long_name\",\n",
+ " \"direction_id\",\n",
+ " \"stop_id\",\n",
+ " \"stop_sequence\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1ab53c4-bd2d-44cc-8e4d-db7801cb5a3b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"actual_arrival_lag_min\"] = (\n",
+ " trips_routes_times2.groupby(groupby_cols)[\"converted_rt_arrival\"]\n",
+ " .diff()\n",
+ " .dt.total_seconds()\n",
+ " / 60\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6328c5fb-205e-4a78-a3d3-62fafa88a4cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"scheduled_arrival_lag_min\"] = (\n",
+ " trips_routes_times2.groupby(groupby_cols)[\"converted_schd_arrival\"]\n",
+ " .diff()\n",
+ " .dt.total_seconds()\n",
+ " / 60\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7fda264c-9496-4164-95d1-f7c6e9fb1784",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"scheduled_arrival_lag_min\"].describe(percentiles)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "875b27f9-5cd5-496a-a9e3-df7f609e4dd7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"actual_arrival_lag_min\"].describe(percentiles)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0370dc8b-8a4d-4e16-8943-a559aa0aac07",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview_time_col = [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"route_id\",\n",
+ " \"stop_id\",\n",
+ " \"stop_sequence\",\n",
+ " \"converted_rt_arrival\",\n",
+ " \"actual_arrival_lag_min\",\n",
+ " \"converted_schd_arrival\",\n",
+ " \"scheduled_arrival_lag_min\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1ba17b18-b9cb-4580-8eeb-47d9d058e9f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.loc[\n",
+ " (\n",
+ " trips_routes_times2.schedule_gtfs_dataset_key\n",
+ " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n",
+ " )\n",
+ " & (trips_routes_times2.route_id == \"204-13172\")\n",
+ " & (trips_routes_times2.stop_id == \"3961\")\n",
+ "][preview_time_col]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8a5f6f72-d463-4076-80ce-d22ab1f718b1",
+ "metadata": {},
+ "source": [
+ "### Many lags are empty because the row is the first arrival within its groupby group."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98df5908-9530-43c8-ab6c-9283f9cc78d0",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[trips_routes_times2[\"scheduled_arrival_lag_min\"].isna()][\n",
+ " preview_time_col\n",
+ "].sample()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b58b89fc-f675-4538-b456-d91431e30229",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.loc[\n",
+ " (\n",
+ " trips_routes_times2.schedule_gtfs_dataset_key\n",
+ " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n",
+ " )\n",
+ " & (trips_routes_times2.route_id == \"204-13172\")\n",
+ " & (trips_routes_times2.stop_sequence == 2)\n",
+ "][preview_time_col]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22564ce6-554d-4a2d-81cc-5b98018116b4",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.loc[\n",
+ " (\n",
+ " trips_routes_times2.schedule_gtfs_dataset_key\n",
+ " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n",
+ " )\n",
+ " & (trips_routes_times2.route_id == \"204-13172\")\n",
+ " & (trips_routes_times2.stop_sequence == 59)\n",
+ "][preview_time_col]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "908b3be6-72b5-483e-8be5-a2932c59cdd4",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.loc[\n",
+ " (\n",
+ " trips_routes_times2.schedule_gtfs_dataset_key\n",
+ " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n",
+ " )\n",
+ " & (trips_routes_times2.route_id == \"204-13172\")\n",
+ " & (trips_routes_times2.stop_sequence == 46)\n",
+ " & (trips_routes_times2.stop_id == \"5685\")\n",
+ "][preview_time_col]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1997e407-9483-4142-95db-2a1892fbace7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# rt_stop_times4 = rt_stop_times4.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "28362518-a54b-4f5d-a4d7-24a3d8ddefd0",
+ "metadata": {},
+ "source": [
+ "### Transit Matters Method"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0f0f4b7-fa64-4b01-a141-5dd78c59693b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transit_matters_df1 = trips_routes_times2.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7bea63e5-45d0-4d06-8c1c-fd34a69ffde7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transit_matters_df1[\"pct_actual_schd_headway\"] = (\n",
+ " transit_matters_df1.actual_arrival_lag_min\n",
+ " / transit_matters_df1.scheduled_arrival_lag_min\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c67c6299-68f0-414f-a9c1-e0b27511b9e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transit_matters_df1[\"bunched_y_n\"] = np.where(\n",
+ " transit_matters_df1[\"pct_actual_schd_headway\"] < 0.25, \"bunched\", \"not bunched\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d7d56ce7-66e9-4084-a725-a9eff7c4c5b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db10254b-d5d4-4619-9c6e-10fab19ec6b2",
+ "metadata": {},
+ "source": [
+ "#### Aggregate.\n",
+ "* At this point, the sequence doesn't matter; we just care about how bunched the traffic is around one particular stop. \n",
+ "* See how many trips for that grain are considered \"bunched\" or not."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30bbbbad-4587-43e3-b5c7-632079f5a588",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def bunched_not_bunched(\n",
+ " df: pd.DataFrame, bunched_y_n: str, groupby_cols: list\n",
+ ") -> pd.DataFrame:\n",
+ " df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)\n",
+ "\n",
+ " bunched_y_n = bunched_y_n.replace(\" \", \"_\")\n",
+ " agg1 = (\n",
+ " df2.groupby(groupby_cols).agg({\"trip_instance_key\": \"nunique\"}).reset_index()\n",
+ " ).rename(columns={\"trip_instance_key\": f\"{bunched_y_n}_trips\"})\n",
+ " return agg1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bb7f41ff-ff97-4a00-b4f8-b9cdab73da64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def agg_final_df(df: pd.DataFrame) -> pd.DataFrame:\n",
+ " groupby_cols = [\n",
+ " \"caltrans_district\",\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"feed_key\",\n",
+ " \"organization_name\",\n",
+ " \"route_long_name\",\n",
+ " \"route_type\",\n",
+ " \"route_id\",\n",
+ " \"direction_id\",\n",
+ " \"stop_id\",\n",
+ " \"stop_sequence\",\n",
+ " ]\n",
+ "\n",
+ " # Find total trips that are bunched\n",
+ " bunched = bunched_not_bunched(df, \"bunched\", groupby_cols)\n",
+ "\n",
+ " # Find total trips that are NOT bunched\n",
+ " not_bunched = bunched_not_bunched(df, \"not bunched\", groupby_cols)\n",
+ "\n",
+ " # Merge\n",
+ " m1 = pd.merge(not_bunched, bunched, on=groupby_cols, how=\"outer\")\n",
+ "\n",
+ " # Find the % of bunched trips\n",
+ " m1 = m1.fillna(0)\n",
+ " m1[\"all_trips\"] = m1.not_bunched_trips + m1.bunched_trips\n",
+ " m1[\"per_trip_bunched_per_stop\"] = m1.bunched_trips / m1.all_trips\n",
+ "\n",
+ " # Filter out any rows with only one trip of that groupby combo\n",
+ " # for that service date\n",
+ " m1 = m1.loc[m1.all_trips > 1].reset_index(drop=True)\n",
+ " m1 = m1.drop(columns=[\"not_bunched_trips\", \"bunched_trips\"])\n",
+ "\n",
+ " return m1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a33d4b24-6737-4c2a-8e38-eec929242b37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transit_matters_m1 = agg_final_df(transit_matters_df1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9a919bad-bb6f-4247-840d-e6f00a349b06",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# transit_matters_m1 = (\n",
+ "# transit_matters_m1.sort_values(by=[\"all_trips\"], ascending=False)\n",
+ "# .drop_duplicates(subset=transit_matters_agg)\n",
+ "# .reset_index(drop=True)\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "710770ad-2d1a-4636-9667-02d320b689e8",
+ "metadata": {},
+ "source": [
+ "### Help: Buses arriving in swapped order are distorting the TransitMatters metric.\n",
+ "* How can we solve for this? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "56e4a936-215b-4bdd-8f45-e86b613198bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview_cols = [\n",
+ " \"converted_rt_arrival\",\n",
+ " \"actual_arrival_lag_min\",\n",
+ " \"converted_schd_arrival\",\n",
+ " \"scheduled_arrival_lag_min\",\n",
+ " \"pct_actual_schd_headway\",\n",
+ " \"bunched_y_n\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e24cb76-4f68-4377-a0df-cedb21904b6e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "example2 = transit_matters_df1.loc[\n",
+ " (transit_matters_df1.stop_id == \"2307719\")\n",
+ " & (transit_matters_df1.organization_name == \"City of Visalia\")\n",
+ " & (transit_matters_df1.route_id == \"2042\")\n",
+ " & (transit_matters_df1.shape_array_key == \"60da59c7000ea5dcb5f845d8fa227f14\")\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67169748-2cba-4591-8cd5-3cc8e4e3a556",
+ "metadata": {},
+ "source": [
+ "#### Starting at row 33484: the RT arrival order is swapped. A bus that was scheduled to arrive at 4:27 arrived before the bus scheduled to arrive at 3:42.\n",
+ "* This repeats at row 33486."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b34f0a6-6a2b-4e56-b0fe-bd0f57b53db8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "example2[preview_cols]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0579e78-2a95-4d8b-9761-2824aa39a8eb",
+ "metadata": {},
+ "source": [
+ "### Use 2 minute benchmark\n",
+ "* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)\n",
+ "* Justifying the use of\n",
+ "headway maintenance. For example, in April\n",
+ "2022 the 66 bus significantly bunched around\n",
+ "several stops. When bunching is defined as\n",
+ "buses that run within two minutes or less of\n",
+ "each other, inbound buses towards Nubian\n",
+ "Square bunched 10% of the time at Brigham\n",
+ "Circle, 9% at Brookline Village and Roxbury\n",
+ "Crossing, and 8% of the time at Coolidge\n",
+ "Corner. Bunching is even more dramatic\n",
+ "outbound towards Harvard Square where\n",
+ "buses bunched over 35% of the time at Winship\n",
+ "St, 13% at Coolidge Corner and Harvard Ave at\n",
+ "Commonwealth Ave, and 12% at North Harvard\n",
+ "St at Western Ave. View more data about bus\n",
+ "bunching through the TransitMatters Data\n",
+ "Dashboard here.\n",
+ "\n",
+ "* To Do: add back in route & operator information"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0706e7e-0d56-43b2-bf3c-4205e9277c64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "two_minutes_df = trips_routes_times2.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2def9283-d995-4001-b412-0fa03a855cd5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "two_minutes_df[\"bunched_y_n\"] = np.where(\n",
+ " two_minutes_df[\"actual_arrival_lag_min\"] <= 2, \"bunched\", \"not bunched\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dcd302cb-5f30-4318-8b28-cb29f6c376cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "two_minutes_df.bunched_y_n.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99de2fd0-2aed-4793-a33b-56d7ffc313bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_two_minute = agg_final_df(two_minutes_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "416da93d-cd6d-4ad1-bf5e-98b20c188661",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_two_minute.loc[\n",
+ " (final_two_minute.stop_id == \"2307695\")\n",
+ " & (final_two_minute.organization_name == \"City of Visalia\")\n",
+ " & (final_two_minute.route_id == \"2042\")\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a0b54b9-3d4c-4cc4-8243-e26a42c47e83",
+ "metadata": {},
+ "source": [
+ "### Comparing both outcomes\n",
+ "* There are so many more bunched trips for the 2 minute approach.\n",
+ "* Add back in schedule_gtfs_key and then grab stop level data from the warehouse."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a11e9bc2-70c6-488a-aa7e-f92d8b53c8e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_two_minute.per_trip_bunched_per_stop.describe(percentiles)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "19c741e4-6476-416b-a62d-51028c6eef68",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transit_matters_m1.per_trip_bunched_per_stop.describe(percentiles)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a6304f84-80fb-4696-a312-d28545073b22",
+ "metadata": {},
+ "source": [
+ "### Make Visuals"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "220981aa-623c-4156-8445-f349f0d98d45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_range = [\n",
+ " \"#ccbb44\",\n",
+ " \"#e9d868\",\n",
+ " \"#fcb40e\",\n",
+ " \"#ff9c42\",\n",
+ " \"#fc5c04\",\n",
+ " \"#dd217d\",\n",
+ " \"#dd217d\",\n",
+ " \"#dd217d\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "949a750a-6e07-4c98-8020-7b201b9bd0fa",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9a5ea9a8-1158-4681-9d97-37a131b2dff4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2[\"hour\"] = trips_routes_times2[\"converted_rt_arrival\"].dt.hour\n",
+ "trips_routes_times2[\"min\"] = trips_routes_times2[\"converted_rt_arrival\"].dt.minute"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "562aa5ef-6dac-443e-b646-88b72174d645",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trips_routes_times2.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec812e6c-3dc7-4e21-9478-1eae1a539ec2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def compare_approaches(\n",
+ " stop_id: str, organization_name: str, route_id: str, stop_sequence: int\n",
+ "):\n",
+ " transit_matter = transit_matters_m1.loc[\n",
+ " (transit_matters_m1.stop_id == stop_id)\n",
+ " & (transit_matters_m1.organization_name == organization_name)\n",
+ " & (transit_matters_m1.route_id == route_id)\n",
+ " & (transit_matters_m1.stop_sequence == stop_sequence)\n",
+ " ]\n",
+ " display(transit_matter)\n",
+ "\n",
+ " two_min = final_two_minute.loc[\n",
+ " (final_two_minute.stop_id == stop_id)\n",
+ " & (final_two_minute.organization_name == organization_name)\n",
+ " & (final_two_minute.route_id == route_id)\n",
+ " & (final_two_minute.stop_sequence == stop_sequence)\n",
+ " ]\n",
+ "\n",
+ " display(two_min)\n",
+ " total_trips = trips_routes_times2.loc[\n",
+ " (trips_routes_times2.stop_id == stop_id)\n",
+ " & (trips_routes_times2.organization_name == organization_name)\n",
+ " & (trips_routes_times2.route_id == route_id)\n",
+ " & (trips_routes_times2.stop_sequence == stop_sequence)\n",
+ " ]\n",
+ "\n",
+ " display(total_trips.trip_instance_key.nunique())\n",
+ "\n",
+ " chart = (\n",
+ " alt.Chart(total_trips)\n",
+ " .mark_circle(size=500)\n",
+ " .encode(\n",
+ " x=\"hour\",\n",
+ " y=\"min\",\n",
+ " color=alt.Color(\n",
+ " \"hour\",\n",
+ " scale=alt.Scale(range=freq_range),\n",
+ " ),\n",
+ " tooltip=[\"hour\", \"min\", \"actual_arrival_lag_min\"],\n",
+ " )\n",
+ " .properties(width=800, height=400)\n",
+ " )\n",
+ " display(chart)\n",
+ " return total_trips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b9efd9b-6799-40a7-baff-3a936790ddd3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test1 = compare_approaches(\n",
+ " stop_id=\"5685\",\n",
+ " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n",
+ " route_id=\"204-13172\",\n",
+ " stop_sequence=46,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6486f4bf-890d-45ae-b6dc-8c938171a466",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test2 = compare_approaches(\n",
+ " stop_id=\"2307469\",\n",
+ " organization_name=\"City of Visalia\",\n",
+ " route_id=\"2042\",\n",
+ " stop_sequence=27,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "85a9a49c-0d2e-4d1a-9b7d-601880d45d5d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test3 = compare_approaches(\n",
+ " stop_id=\"3104\",\n",
+ " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n",
+ " route_id=\"33-13172\",\n",
+ " stop_sequence=80,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "713f259d-125b-473d-abff-877f3e2d4973",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test4 = compare_approaches(\n",
+ " stop_id=\"15320\",\n",
+ " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n",
+ " route_id=\"33-13172\",\n",
+ " stop_sequence=64,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b63c81fa-efdc-464b-a567-23ca9c194321",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test5 = compare_approaches(\n",
+ " stop_id=\"3288014\",\n",
+ " organization_name=\"City of Thousand Oaks\",\n",
+ " route_id=\"3402\",\n",
+ " stop_sequence=16,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6be5a05-07e3-4a11-a5cd-844380717548",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "transit_matters_m1.sort_values(by=[\"per_trip_bunched_per_stop\"], ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34ba711c-f004-4a62-81ef-0effbf5401ed",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "### Make Maps \n",
+ "* I think I actually need the vehicle positions: the stops data only carries each stop's own geometry, so every arrival would plot on the same spot.\n",
+ "\n",
+ "* https://github.com/cal-itp/data-analyses/blob/db19b70329f1e817236bda13707dd903c24abb4c/_shared_utils/shared_utils/gtfs_utils_v2.py#L371\n",
+ "* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/download_stops.py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef8e36d2-2209-4a99-95cf-11ca0371d93c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2c323fdc-14bd-442f-b3d0-5b5fa20a2e48",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# What is this file?\n",
+ "vps_gdf = gpd.read_parquet(\n",
+ " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2024-05-22.parquet\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e8b2d27e-0a1d-4d6d-a5e8-da7e915b4a24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vps_gdf = vps_gdf[\n",
+ " [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"trip_instance_key\",\n",
+ " \"location_timestamp_local\",\n",
+ " \"geometry\",\n",
+ " ]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe5770c0-07e2-4cd0-905c-6b5a28f9a1ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vps_df = vps_gdf[\n",
+ " [\n",
+ " \"schedule_gtfs_dataset_key\",\n",
+ " \"trip_instance_key\",\n",
+ " \"location_timestamp_local\",\n",
+ " ]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6016595b-b80d-4fef-938b-5d75e35c8a8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vps_m1 = pd.merge(\n",
+ " vps_df,\n",
+ " trips_routes_times2,\n",
+ " on=[\"schedule_gtfs_dataset_key\", \"trip_instance_key\"],\n",
+ " how=\"inner\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03064c85-8e73-4370-8c00-a0f34fa76029",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def one_stop(df: pd.DataFrame, stop_id: str, org_name: str, route_id: str):\n",
+ " # Look at one route & stop\n",
+ " test_route1 = df.loc[\n",
+ " (df.organization_name == org_name)\n",
+ " & (df.route_id == route_id)\n",
+ " & (df.stop_id == stop_id)\n",
+ " ]\n",
+ "\n",
+ " compare_approaches(stop_id=stop_id, organization_name=org_name, route_id=route_id)\n",
+ "\n",
+ " # display(test_route1.explore(\"time_int\", marker_kwds = {'radius':20}))\n",
+ " return test_route1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c513281-039e-45a5-98d0-9220e132f92a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metro_test1 = one_stop(\n",
+ " vps_m1,\n",
+ " stop_id=\"5700\",\n",
+ " org_name=\"Los Angeles County Metropolitan Transportation Authority\",\n",
+ " route_id=\"204-13172\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a8d5439-0529-4adf-b71f-6bcfa6353392",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metro_test1.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8ead8a95-85fd-454d-b7ce-d5a85d6406d5",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0baa05d0-40fe-4406-a80e-5acd80a8edae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metro_test1.sample(1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76a7e40c-c3d8-476c-b5e1-f38f7cc12345",
+ "metadata": {},
+ "source": [
+ "### Other\n",
+ "* https://www.sciencedirect.com/science/article/pii/S1366554523003666\n",
+ "* https://www.sciencedirect.com/science/article/pii/S0968090X22002492?ref=pdf_download&fr=RR-2&rr=8d7d6fb73d8015be"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}