From e2b02539becd9d123778891d29f1fbddcec2d2a3 Mon Sep 17 00:00:00 2001 From: amandaha8 Date: Fri, 8 Nov 2024 23:02:55 +0000 Subject: [PATCH] trying to apply approach to all ops --- gtfs_digest/37_transit_bunching_samples.ipynb | 86 +- gtfs_digest/41_transit_bunching_all.ipynb | 1725 ++++++++--- gtfs_digest/42_transit_bunching_kernel.ipynb | 2234 +++++++++++++++ gtfs_digest/43_transit_bunching.ipynb | 2540 +++++++++++++++++ 4 files changed, 6212 insertions(+), 373 deletions(-) create mode 100644 gtfs_digest/42_transit_bunching_kernel.ipynb create mode 100644 gtfs_digest/43_transit_bunching.ipynb diff --git a/gtfs_digest/37_transit_bunching_samples.ipynb b/gtfs_digest/37_transit_bunching_samples.ipynb index ce4ab095b..a882c0b66 100644 --- a/gtfs_digest/37_transit_bunching_samples.ipynb +++ b/gtfs_digest/37_transit_bunching_samples.ipynb @@ -41142,7 +41142,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2446/3770378478.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", + "/tmp/ipykernel_2867/3770378478.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", " trips_routes_times2.converted_schd_arrival.describe()\n" ] }, @@ -41463,7 +41463,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2446/452450045.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", + "/tmp/ipykernel_2867/452450045.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", " trips_routes_times2.converted_rt_arrival.describe()\n" ] }, @@ -41839,14 +41839,14 @@ " \n", " \n", " \n", - " 7746\n", + " 10317\n", " 0666caf3ec1ecc96b74f4477ee4bc939\n", " 33-13172\n", - " 6935\n", - " 20\n", - " 2024-05-22 06:26:29\n", + " 4690\n", + " 21\n", + " 2024-05-22 00:24:04\n", " NaN\n", - " 2024-05-22 06:25:00\n", + " 2024-05-22 00:25:00\n", " NaN\n", " \n", " \n", @@ -41854,14 +41854,14 @@ "" ], "text/plain": [ - " schedule_gtfs_dataset_key route_id stop_id stop_sequence \\\n", - "7746 0666caf3ec1ecc96b74f4477ee4bc939 33-13172 6935 20 \n", + " schedule_gtfs_dataset_key route_id stop_id stop_sequence \\\n", + "10317 0666caf3ec1ecc96b74f4477ee4bc939 33-13172 4690 21 \n", "\n", - " converted_rt_arrival actual_arrival_lag_min converted_schd_arrival \\\n", - "7746 2024-05-22 06:26:29 NaN 2024-05-22 06:25:00 \n", + " converted_rt_arrival actual_arrival_lag_min converted_schd_arrival \\\n", + "10317 2024-05-22 00:24:04 NaN 2024-05-22 00:25:00 \n", "\n", - " scheduled_arrival_lag_min \n", - "7746 NaN " + " scheduled_arrival_lag_min \n", + "10317 NaN " ] }, "execution_count": 73, @@ -50608,23 +50608,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test1 = compare_approaches(\n", + " stop_id=\"5685\",\n", + " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n", + " route_id=\"204-13172\",\n", + " stop_sequence=46,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "bddf4717-b901-4ee0-a9a1-46df6f9a04e3", + "metadata": {}, + "source": [ + "### Something going wrong for `all_trips` once I aggregate for all operators." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "8af00aab-c5cd-48c7-babb-1d45f2ecdc8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caltrans_districtschedule_gtfs_dataset_keyfeed_keyorganization_nameroute_long_nameroute_typeroute_iddirection_idstop_idstop_sequenceall_tripsper_trip_bunched_per_stop
4753507 - Los Angeles0666caf3ec1ecc96b74f4477ee4bc939608992664173210532aa3e6cc573be2fLos Angeles County Metropolitan Transportation AuthorityMetro Local LineBus33-131721.0031048030.000.10
\n", + "
" + ], + "text/plain": [ + " caltrans_district schedule_gtfs_dataset_key \\\n", + "47535 07 - Los Angeles 0666caf3ec1ecc96b74f4477ee4bc939 \n", + "\n", + " feed_key \\\n", + "47535 608992664173210532aa3e6cc573be2f \n", + "\n", + " organization_name \\\n", + "47535 Los Angeles County Metropolitan Transportation Authority \n", + "\n", + " route_long_name route_type route_id direction_id stop_id \\\n", + "47535 Metro Local Line Bus 33-13172 1.00 3104 \n", + "\n", + " stop_sequence all_trips per_trip_bunched_per_stop \n", + "47535 80 30.00 0.10 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
caltrans_districtschedule_gtfs_dataset_keyfeed_keyorganization_nameroute_long_nameroute_typeroute_iddirection_idstop_idstop_sequenceall_tripsper_trip_bunched_per_stop
5596507 - Los Angeles0666caf3ec1ecc96b74f4477ee4bc939608992664173210532aa3e6cc573be2fLos Angeles County Metropolitan Transportation AuthorityMetro Local LineBus33-131721.0031048053.000.51
\n", + "
" + ], + "text/plain": [ + " caltrans_district schedule_gtfs_dataset_key \\\n", + "55965 07 - Los Angeles 0666caf3ec1ecc96b74f4477ee4bc939 \n", + "\n", + " feed_key \\\n", + "55965 608992664173210532aa3e6cc573be2f \n", + "\n", + " organization_name \\\n", + "55965 Los Angeles County Metropolitan Transportation Authority \n", + "\n", + " route_long_name route_type route_id direction_id stop_id \\\n", + "55965 Metro Local Line Bus 33-13172 1.00 3104 \n", + "\n", + " stop_sequence all_trips per_trip_bunched_per_stop \n", + "55965 80 53.00 0.51 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "27" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "test3 = compare_approaches(\n", + " stop_id=\"3104\",\n", + " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n", + " route_id=\"33-13172\",\n", + " stop_sequence=80,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/gtfs_digest/43_transit_bunching.ipynb b/gtfs_digest/43_transit_bunching.ipynb new file mode 100644 index 000000000..2884d59d4 --- /dev/null +++ b/gtfs_digest/43_transit_bunching.ipynb @@ -0,0 +1,2540 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "733e5c07-e894-48de-b92a-9cba10b7fc9a", + "metadata": {}, + "source": [ + "## Transit Bunching \n", + "* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`\n", + "* [Issue](https://github.com/cal-itp/data-analyses/issues/1099)\n", + "### 11/8\n", + "* Figure out how to address City of Visalia: one of the buses that is scheduled to arrive earlier arrives later than another bus. 
\n", + "* This leads to a negative time stamp and makes it appear like there is a lot of bunching per the Transit Matters approach.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "60097cf1-857d-4c7f-9fc7-043a69ec1a61", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime as dt\n", + "\n", + "import altair as alt\n", + "import geopandas as gpd\n", + "import merge_data\n", + "import numpy as np\n", + "import pandas as pd\n", + "from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils\n", + "from shared_utils import catalog_utils, rt_dates, rt_utils\n", + "from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS\n", + "\n", + "# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml\n", + "GTFS_DATA_DICT = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n", + "\n", + "from segment_speed_utils.project_vars import (\n", + " COMPILED_CACHED_VIEWS,\n", + " GTFS_DATA_DICT,\n", + " PROJECT_CRS,\n", + " RT_SCHED_GCS,\n", + " SCHED_GCS,\n", + " SEGMENT_GCS,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "51a085f8-5981-4da7-904f-6348cf2e18b0", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "113bd786-8d4b-4153-939b-b419b4fa97ee", + "metadata": {}, + "outputs": [], + "source": [ + "may_date = \"2024-05-22\"" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "44a80420-dbe8-4a2b-9c47-44b32bf28e00", + "metadata": {}, + "outputs": [], + "source": [ + "drop_for_preview = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"trip_instance_key\",\n", + " \"shape_array_key\",\n", + " \"feed_key\",\n", + " \"trip_id\",\n", + "]" + ] + }, + { + 
"cell_type": "markdown", + "id": "2be7e7cc-b6c5-4c87-b2d5-6cb8612e23ce", + "metadata": {}, + "source": [ + "### Grab Sample Routes" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "873bf893-40a2-4b64-84ba-68604c6df18b", + "metadata": {}, + "outputs": [], + "source": [ + "subset = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"route_primary_direction\",\n", + " \"service_date\",\n", + " \"frequency\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c9c35b69-d5ad-47b6-9027-33cc6631835e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'schedule_route_dir/schedule_route_direction_metrics'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "85ad8696-1b70-4996-8866-3cd2ad3d3738", + "metadata": {}, + "outputs": [], + "source": [ + "route_dir_columns = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"time_period\",\n", + " \"route_primary_direction\",\n", + " \"frequency\",\n", + " \"service_date\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "9d07da3c-e347-4671-925b-e8f5bc31c0fd", + "metadata": {}, + "outputs": [], + "source": [ + "route_dir = merge_data.concatenate_schedule_by_route_direction([may_date])[\n", + " route_dir_columns\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "260b6305-9e99-4870-82f7-5cae925d9a0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idtime_periodroute_primary_directionfrequencyservice_date
0015d67d5b75b5cf2b710bbadadfb75f5170.00all_dayNorthbound0.922024-05-22
1015d67d5b75b5cf2b710bbadadfb75f5170.00offpeakNorthbound0.622024-05-22
2015d67d5b75b5cf2b710bbadadfb75f5170.00peakNorthbound1.502024-05-22
3015d67d5b75b5cf2b710bbadadfb75f5171.00all_daySouthbound0.922024-05-22
4015d67d5b75b5cf2b710bbadadfb75f5171.00offpeakSouthbound0.692024-05-22
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id time_period \\\n", + "0 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", + "1 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 offpeak \n", + "2 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 peak \n", + "3 015d67d5b75b5cf2b710bbadadfb75f5 17 1.00 all_day \n", + "4 015d67d5b75b5cf2b710bbadadfb75f5 17 1.00 offpeak \n", + "\n", + " route_primary_direction frequency service_date \n", + "0 Northbound 0.92 2024-05-22 \n", + "1 Northbound 0.62 2024-05-22 \n", + "2 Northbound 1.50 2024-05-22 \n", + "3 Southbound 0.92 2024-05-22 \n", + "4 Southbound 0.69 2024-05-22 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "route_dir.head()" + ] + }, + { + "cell_type": "markdown", + "id": "84741559-46a2-4a62-a6e2-8843771aea1f", + "metadata": {}, + "source": [ + "#### Attach operators and districts" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d35238bb-2418-466b-8814-96382abeb3eb", + "metadata": {}, + "outputs": [], + "source": [ + "# Grab Crosswalk\n", + "CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "6272b8aa-0d76-4e18-bc76-66548f54c9a0", + "metadata": {}, + "outputs": [], + "source": [ + "crosswalk_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"organization_name\",\n", + " \"name\",\n", + " \"caltrans_district\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d5d62df4-cffb-4bc8-9ad1-9766a8ec2bf1", + "metadata": {}, + "outputs": [], + "source": [ + "crosswalk_df = (\n", + " time_series_utils.concatenate_datasets_across_dates(\n", + " SCHED_GCS, CROSSWALK, [may_date], data_type=\"df\", columns=crosswalk_cols\n", + " )\n", + " .sort_values([\"service_date\"])\n", + " .reset_index(drop=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": 
"4666bf1d-6456-49d6-955a-5ab77556af15", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyorganization_namenamecaltrans_districtservice_date
01770249a5a2e770ca90628434d4934b1Ventura County Transportation CommissionVCTC GMV Schedule07 - Los Angeles2024-05-22
1f8102a9c0693206bf36d302540bf1bcfCity of CoronaCorona Schedule08 - San Bernardino2024-05-22
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key organization_name \\\n", + "0 1770249a5a2e770ca90628434d4934b1 Ventura County Transportation Commission \n", + "1 f8102a9c0693206bf36d302540bf1bcf City of Corona \n", + "\n", + " name caltrans_district service_date \n", + "0 VCTC GMV Schedule 07 - Los Angeles 2024-05-22 \n", + "1 Corona Schedule 08 - San Bernardino 2024-05-22 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crosswalk_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "661b3b18-5735-443d-acfc-4f598882a661", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(189, 5)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crosswalk_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d9d562b4-7ce1-4ea5-bdba-e8e0abbc3815", + "metadata": {}, + "outputs": [], + "source": [ + "routes = pd.merge(\n", + " route_dir,\n", + " crosswalk_df,\n", + " on=[\"schedule_gtfs_dataset_key\", \"service_date\"],\n", + " how=\"left\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "3e80b935-ecb4-44cd-9174-72c0d4568e14", + "metadata": {}, + "outputs": [], + "source": [ + "# routes = pd.concat([thousand_oaks, visalia, metro, metro_33])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "6f1b4c16-841d-43a1-89d6-fb4bcd394786", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['17', '219', '228', ..., '10867636', '10867637', '11096761'],\n", + " dtype=object)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "routes.route_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "796640fb-35e0-42d1-8cd0-cc10c31befcf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schedule_gtfs_dataset_keyroute_iddirection_idtime_periodroute_primary_directionfrequencyservice_dateorganization_namenamecaltrans_district
0015d67d5b75b5cf2b710bbadadfb75f5170.00all_dayNorthbound0.922024-05-22Marin County Transit DistrictBay Area 511 Marin Schedule04 - Oakland
1015d67d5b75b5cf2b710bbadadfb75f5170.00offpeakNorthbound0.622024-05-22Marin County Transit DistrictBay Area 511 Marin Schedule04 - Oakland
2015d67d5b75b5cf2b710bbadadfb75f5170.00peakNorthbound1.502024-05-22Marin County Transit DistrictBay Area 511 Marin Schedule04 - Oakland
\n", + "
" + ], + "text/plain": [ + " schedule_gtfs_dataset_key route_id direction_id time_period \\\n", + "0 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 all_day \n", + "1 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 offpeak \n", + "2 015d67d5b75b5cf2b710bbadadfb75f5 17 0.00 peak \n", + "\n", + " route_primary_direction frequency service_date \\\n", + "0 Northbound 0.92 2024-05-22 \n", + "1 Northbound 0.62 2024-05-22 \n", + "2 Northbound 1.50 2024-05-22 \n", + "\n", + " organization_name name \\\n", + "0 Marin County Transit District Bay Area 511 Marin Schedule \n", + "1 Marin County Transit District Bay Area 511 Marin Schedule \n", + "2 Marin County Transit District Bay Area 511 Marin Schedule \n", + "\n", + " caltrans_district \n", + "0 04 - Oakland \n", + "1 04 - Oakland \n", + "2 04 - Oakland " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "routes.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "9ca3a3e0-561f-4588-b71d-80abe692215a", + "metadata": {}, + "source": [ + "### Add Trips" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "674f8ee6-3f6d-4f90-90ba-7ea9ee688b28", + "metadata": {}, + "outputs": [], + "source": [ + "TABLE = GTFS_DATA_DICT.schedule_downloads.trips" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "5333afa8-4849-4864-b253-b03a1093e84f", + "metadata": {}, + "outputs": [], + "source": [ + "FILE = f\"{COMPILED_CACHED_VIEWS}{TABLE}_{may_date}.parquet\"" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "111e2c27-337d-4440-a1f6-10ec582a6f9e", + "metadata": {}, + "outputs": [], + "source": [ + "trips_subset = [\n", + " \"gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"trip_instance_key\",\n", + " \"shape_array_key\",\n", + " \"feed_key\",\n", + " \"route_long_name\",\n", + " \"direction_id\",\n", + " \"route_type\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": 
"e1aed68a-0ed2-4da4-904a-cb91681e7f38", + "metadata": {}, + "outputs": [], + "source": [ + "trips = pd.read_parquet(FILE)[trips_subset].rename(\n", + " columns={\"gtfs_dataset_key\": \"schedule_gtfs_dataset_key\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "04901f96-76d5-4d99-b3d1-f174ef989357", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes = pd.merge(\n", + " trips,\n", + " routes,\n", + " on=[\"schedule_gtfs_dataset_key\", \"route_id\", \"direction_id\"],\n", + " how=\"inner\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "1195177f-0215-4b24-9e47-dccfb3ee542e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(392497, 15)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trips_routes.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "9c1ba23b-30df-4916-a522-eb70bd5afdb9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1338" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trips_routes.route_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "3fe48162-5b42-4bf9-8853-a9d742e9d03b", + "metadata": {}, + "source": [ + "#### I know we can get this from the warehouse but it seems cumbersome. Correct me if I'm wrong." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "d9688e03-4b61-4736-b9d5-3539b0de80b2", + "metadata": {}, + "outputs": [], + "source": [ + "# https://gtfs.org/documentation/schedule/reference/#\n", + "route_type_crosswalk = {\n", + " \"route_type\": [\"0\", \"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"11\", \"12\"],\n", + " \"route_type_str\": [\n", + " \"Tram, Streetcar, Light rail\",\n", + " \"Subway, Metro\",\n", + " \"Rail\",\n", + " \"Bus\",\n", + " \"Ferry.\",\n", + " \"Cable tram.\",\n", + " \"Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway).\",\n", + " \"Funicular.\",\n", + " \"Trolleybus.\",\n", + " \"Monorail.\",\n", + " ],\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "8dac05a6-0ba9-472b-85a2-5a0081550efb", + "metadata": {}, + "outputs": [], + "source": [ + "route_type_crosswalk_df = pd.DataFrame(route_type_crosswalk)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "cd13aa3e-f222-49f8-b923-1e9e901f7bfb", + "metadata": {}, + "outputs": [], + "source": [ + "# Merge for route_type\n", + "trips_routes = pd.merge(\n", + " trips_routes, route_type_crosswalk_df, on=[\"route_type\"], how=\"left\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ae72b7fc-ec7b-4dcd-8553-ac2abce5da1d", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes = trips_routes.drop(columns=[\"route_type\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "cd31791a-fcb7-4731-90d7-bf5606dd4ce3", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes = trips_routes.rename(columns={\"route_type_str\": \"route_type\"})" + ] + }, + { + "cell_type": "markdown", + "id": "e3be8778-84ff-479f-a3b2-178e374da5f2", + "metadata": {}, + "source": [ + "### Get Stop Times " + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "3f6727c6-205f-4cc8-8a68-42e8dec6e4b3", + "metadata": {}, + "outputs": [], + "source": [ + 
"rt_stop_times = pd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_rt_stop_times_2024-05-22.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "041731b3-0fcd-4b3e-8d01-f84460dd5fab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trip_idstop_idstop_sequencescheduled_arrival_secschedule_gtfs_dataset_keytrip_instance_keyrt_arrival_sec
01d105244-776c-4b3f-af78-9c7ad78c21030b2443b6-b50f-452b-a749-464588ca93b8860991.001fd2f07342d966919b15d5d37fda8cc845ae17540ca9fb5030c84dbb12e48e9a61434
11d105244-776c-4b3f-af78-9c7ad78c2103cd5650b0-9a18-4e78-aedc-385f3094fa0f961179.001fd2f07342d966919b15d5d37fda8cc845ae17540ca9fb5030c84dbb12e48e9a61616
\n", + "
" + ], + "text/plain": [ + " trip_id stop_id \\\n", + "0 1d105244-776c-4b3f-af78-9c7ad78c2103 0b2443b6-b50f-452b-a749-464588ca93b8 \n", + "1 1d105244-776c-4b3f-af78-9c7ad78c2103 cd5650b0-9a18-4e78-aedc-385f3094fa0f \n", + "\n", + " stop_sequence scheduled_arrival_sec schedule_gtfs_dataset_key \\\n", + "0 8 60991.00 1fd2f07342d966919b15d5d37fda8cc8 \n", + "1 9 61179.00 1fd2f07342d966919b15d5d37fda8cc8 \n", + "\n", + " trip_instance_key rt_arrival_sec \n", + "0 45ae17540ca9fb5030c84dbb12e48e9a 61434 \n", + "1 45ae17540ca9fb5030c84dbb12e48e9a 61616 " + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rt_stop_times.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "049a2833-f132-431a-8f44-92f31cd11d8a", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times = pd.merge(\n", + " rt_stop_times,\n", + " trips_routes,\n", + " on=[\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"trip_instance_key\",\n", + " ],\n", + " how=\"inner\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "7c59d770-d379-422e-a23d-9140c23df375", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "44988" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(trips_routes_times.scheduled_arrival_sec.isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "4b2df823-ec98-483d-b30f-9072c62704b6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 11031910 entries, 0 to 11031909\n", + "Data columns (total 20 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 trip_id object \n", + " 1 stop_id object \n", + " 2 stop_sequence int64 \n", + " 3 scheduled_arrival_sec float64 \n", + " 4 schedule_gtfs_dataset_key object \n", + " 5 trip_instance_key object \n", + " 6 rt_arrival_sec int64 \n", + 
" 7 route_id object \n", + " 8 shape_array_key object \n", + " 9 feed_key object \n", + " 10 route_long_name object \n", + " 11 direction_id float64 \n", + " 12 time_period object \n", + " 13 route_primary_direction object \n", + " 14 frequency float64 \n", + " 15 service_date datetime64[ns]\n", + " 16 organization_name object \n", + " 17 name object \n", + " 18 caltrans_district object \n", + " 19 route_type object \n", + "dtypes: datetime64[ns](1), float64(3), int64(2), object(14)\n", + "memory usage: 1.7+ GB\n" + ] + } + ], + "source": [ + "trips_routes_times.info()" + ] + }, + { + "cell_type": "markdown", + "id": "2d09574e-464e-4f31-8f53-0596911dcabe", + "metadata": {}, + "source": [ + "### Sorting " + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "0ef36e93-79ed-4f86-b16a-9d28d90aea1a", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2 = trips_routes_times.sort_values(\n", + " by=[\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_long_name\",\n", + " \"shape_array_key\",\n", + " \"direction_id\",\n", + " \"stop_sequence\",\n", + " \"stop_id\",\n", + " \"rt_arrival_sec\",\n", + " ]\n", + ").reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "976db21c-45da-4773-8285-fe4a33e14152", + "metadata": {}, + "outputs": [], + "source": [ + "preview_sort_col = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"stop_sequence\",\n", + " \"rt_arrival_sec\",\n", + " \"stop_id\",\n", + " \"scheduled_arrival_sec\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "b0bbe80c-75bf-4d3e-b5e7-891791114291", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(21009, 7)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trips_routes_times2.loc[\n", + " (\n", + " trips_routes_times2.schedule_gtfs_dataset_key\n", + " == 
\"0666caf3ec1ecc96b74f4477ee4bc939\"\n", + " )\n", + " & (trips_routes_times2.route_id == \"204-13172\")\n", + " & (trips_routes_times2.direction_id == 1)\n", + "][preview_sort_col].shape" + ] + }, + { + "cell_type": "markdown", + "id": "48737ec7-31be-4743-97ca-c6c000670a13", + "metadata": { + "tags": [] + }, + "source": [ + "### Convert scheduled and RT arrival times.\n", + "* If 82800 < `scheduled_arrival_time` < 86_400 but `rt_arrival_sec` is lower say 14_000 (4 am in the morning): then the bus was scheduled to arrive on May 21 (day before the service date) but it arrived a little later on the actual service date we query the data for. \n", + "* If 86_400 < `scheduled_arrival_time` and `rt_arrival_sec` is around 86_000 then this is the same service date. " + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "d4e5a2df-a110-4347-9c6d-7f9bdfdbce1b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 11031910.00\n", + "mean 48176.39\n", + "std 17806.20\n", + "min 0.00\n", + "25% 33526.00\n", + "50% 48304.00\n", + "75% 62400.00\n", + "max 86399.00\n", + "Name: rt_arrival_sec, dtype: float64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trips_routes_times2[\"rt_arrival_sec\"].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "97e3dc65-45fc-4318-a5f6-b0f497a6ab04", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1024, 20)" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trips_routes_times2.loc[trips_routes_times2[\"scheduled_arrival_sec\"] == 86_400].shape" + ] + }, + { + "cell_type": "markdown", + "id": "630e4f7c-73c5-4f90-9da8-bf50378f3996", + "metadata": {}, + "source": [ + "#### Filter out for trips with `scheduled_arrival_sec` that's over 24 hours." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "099c6838-aa42-4eb5-a250-0ac3e054f834", + "metadata": {}, + "outputs": [], + "source": [ + "timestamp_subset = [\n", + " \"converted_schd_arrival\",\n", + " \"converted_rt_arrival\",\n", + " \"scheduled_arrival_sec\",\n", + " \"rt_arrival_sec\",\n", + " \"service_date\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "d64637d8-ca0d-4e3c-b956-13d8e64e513c", + "metadata": {}, + "outputs": [], + "source": [ + "def adjust_sched_arrival_seconds(sched_arrival_seconds, date, rt_arrival_sec):\n", + " \"\"\"\n", + " Adjusts days and time of sched_arrival_seconds because it runs over 24 hours\n", + " based on a combination of sched_arrival_seconds and rt_arrival_sec criteria.\n", + "\n", + " Parameters:\n", + " sched_arrival_seconds (int): Number of seconds.\n", + " date (datetime): Initial date.\n", + " rt_arrival_sec (int): Arrival time in seconds.\n", + "\n", + " Returns:\n", + " datetime: Adjusted date and time for sched_arrival_seconds\n", + " \"\"\"\n", + " # If the RT arrival time is between 12 and 1 AM and the scheduled arrival time\n", + " # is between 11 PM and 1 AM, subtract a day\n", + " if rt_arrival_sec < (60 * 60) and (82_800 < sched_arrival_seconds < 90_000):\n", + " return pd.Timestamp(date + pd.Timedelta(days=-1)) + pd.Timedelta(\n", + " seconds=sched_arrival_seconds % 86400\n", + " )\n", + " else:\n", + " # No change\n", + " return pd.Timestamp(date) + pd.Timedelta(seconds=sched_arrival_seconds)" + ] + }, + { + "cell_type": "markdown", + "id": "75bfaa72-2fbc-4b88-9751-10744cdfc193", + "metadata": {}, + "source": [ + "#### Subset to make the df smaller\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "2f0efe6f-81ac-4382-9f9f-5455736a3026", + "metadata": {}, + "outputs": [], + "source": [ + "subset = [\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " \"scheduled_arrival_sec\",\n", + " \"schedule_gtfs_dataset_key\",\n", + 
\"trip_instance_key\",\n", + " \"rt_arrival_sec\",\n", + " \"route_id\",\n", + " \"shape_array_key\",\n", + " \"route_long_name\",\n", + " \"direction_id\",\n", + " \"organization_name\",\n", + " \"caltrans_district\",\n", + " \"service_date\",\n", + " \"route_type\",\n", + " \"feed_key\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "bea6dc67-8a1c-484a-b141-43f29af83df9", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2 = trips_routes_times2[subset]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "41f6aded-7f9c-4f5e-ba7a-2a52769a3e44", + "metadata": {}, + "outputs": [], + "source": [ + "#trips_routes_times2[\"converted_schd_arrival2\"] = trips_routes_times2.apply(\n", + " # lambda row: adjust_sched_arrival_seconds(\n", + " # row[\"scheduled_arrival_sec\"], row[\"service_date\"], row[\"rt_arrival_sec\"]\n", + " # ),\n", + "# axis=1,\n", + "#)#" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "3ce579d1-0392-41a6-b535-9f4422a78216", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"converted_rt_arrival\"] = pd.to_datetime(\n", + " trips_routes_times2[\"service_date\"]\n", + ") + pd.to_timedelta(trips_routes_times2[\"rt_arrival_sec\"] % 86400, unit=\"s\")" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "1d3e4966-1f57-4117-9f73-506055a42d7b", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"converted_schd_arrival\"] = pd.to_datetime(\n", + " trips_routes_times2[\"service_date\"]\n", + ") + pd.to_timedelta(trips_routes_times2[\"scheduled_arrival_sec\"] % 86400, unit=\"s\")" + ] + }, + { + "cell_type": "markdown", + "id": "b44fa6d1-edf4-42be-af6f-3bc777b091c1", + "metadata": {}, + "source": [ + "#### Checkout results from the code\n", + "* Make sure it makes sense.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "60d18d14-1f6b-4b97-a736-cb295b3a6382", + "metadata": { + 
"scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
scheduled_arrival_secconverted_schd_arrivalrt_arrival_secconverted_rt_arrival
157540287180.002024-05-22 00:13:008462024-05-22 00:14:06
57307491800.002024-05-22 01:30:0053962024-05-22 01:29:56
4038881103973.002024-05-22 04:52:53176252024-05-22 04:53:45
1823671102300.002024-05-22 04:25:00157832024-05-22 04:23:03
431234986691.002024-05-22 00:04:512092024-05-22 00:03:29
57174989160.002024-05-22 00:46:0026892024-05-22 00:44:49
408319198040.002024-05-22 03:14:00116992024-05-22 03:14:59
404676787111.002024-05-22 00:11:515952024-05-22 00:09:55
739711286820.002024-05-22 00:07:003972024-05-22 00:06:37
77282689160.002024-05-22 00:46:0041802024-05-22 01:09:40
189101586760.002024-05-22 00:06:009102024-05-22 00:15:10
37026196960.002024-05-22 02:56:00101922024-05-22 02:49:52
482931587035.002024-05-22 00:10:356342024-05-22 00:10:34
126478895340.002024-05-22 02:29:0090332024-05-22 02:30:33
4415882100280.002024-05-22 03:51:20141382024-05-22 03:55:38
373870586938.002024-05-22 00:08:586532024-05-22 00:10:53
299894089100.002024-05-22 00:45:0030142024-05-22 00:50:14
207613998100.002024-05-22 03:15:00119522024-05-22 03:19:12
394444996840.002024-05-22 02:54:00108732024-05-22 03:01:13
4082972106965.002024-05-22 05:42:45203782024-05-22 05:39:38
\n", + "
" + ], + "text/plain": [ + " scheduled_arrival_sec converted_schd_arrival rt_arrival_sec \\\n", + "1575402 87180.00 2024-05-22 00:13:00 846 \n", + "573074 91800.00 2024-05-22 01:30:00 5396 \n", + "4038881 103973.00 2024-05-22 04:52:53 17625 \n", + "1823671 102300.00 2024-05-22 04:25:00 15783 \n", + "4312349 86691.00 2024-05-22 00:04:51 209 \n", + "571749 89160.00 2024-05-22 00:46:00 2689 \n", + "4083191 98040.00 2024-05-22 03:14:00 11699 \n", + "4046767 87111.00 2024-05-22 00:11:51 595 \n", + "7397112 86820.00 2024-05-22 00:07:00 397 \n", + "772826 89160.00 2024-05-22 00:46:00 4180 \n", + "1891015 86760.00 2024-05-22 00:06:00 910 \n", + "370261 96960.00 2024-05-22 02:56:00 10192 \n", + "4829315 87035.00 2024-05-22 00:10:35 634 \n", + "1264788 95340.00 2024-05-22 02:29:00 9033 \n", + "4415882 100280.00 2024-05-22 03:51:20 14138 \n", + "3738705 86938.00 2024-05-22 00:08:58 653 \n", + "2998940 89100.00 2024-05-22 00:45:00 3014 \n", + "2076139 98100.00 2024-05-22 03:15:00 11952 \n", + "3944449 96840.00 2024-05-22 02:54:00 10873 \n", + "4082972 106965.00 2024-05-22 05:42:45 20378 \n", + "\n", + " converted_rt_arrival \n", + "1575402 2024-05-22 00:14:06 \n", + "573074 2024-05-22 01:29:56 \n", + "4038881 2024-05-22 04:53:45 \n", + "1823671 2024-05-22 04:23:03 \n", + "4312349 2024-05-22 00:03:29 \n", + "571749 2024-05-22 00:44:49 \n", + "4083191 2024-05-22 03:14:59 \n", + "4046767 2024-05-22 00:09:55 \n", + "7397112 2024-05-22 00:06:37 \n", + "772826 2024-05-22 01:09:40 \n", + "1891015 2024-05-22 00:15:10 \n", + "370261 2024-05-22 02:49:52 \n", + "4829315 2024-05-22 00:10:34 \n", + "1264788 2024-05-22 02:30:33 \n", + "4415882 2024-05-22 03:55:38 \n", + "3738705 2024-05-22 00:10:53 \n", + "2998940 2024-05-22 00:50:14 \n", + "2076139 2024-05-22 03:19:12 \n", + "3944449 2024-05-22 03:01:13 \n", + "4082972 2024-05-22 05:39:38 " + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"trips_routes_times2.loc[trips_routes_times2[\"scheduled_arrival_sec\"] > 86_400][\n", + " [\n", + " \"scheduled_arrival_sec\",\n", + " \"converted_schd_arrival\",\n", + " \"rt_arrival_sec\",\n", + " \"converted_rt_arrival\",\n", + " ]\n", + "].sample(20)" + ] + }, + { + "cell_type": "markdown", + "id": "31b10b40-73bd-4b02-92b7-bf0cacaf67b9", + "metadata": {}, + "source": [ + "### Deal with delays\n", + "* Some very extreme values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8325eed2-412b-4202-9d57-252db2fd7e26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 10986922.00\n", + "mean 1.90\n", + "std 35.79\n", + "min -1439.78\n", + "25% -0.23\n", + "50% 1.45\n", + "75% 4.05\n", + "max 1439.98\n", + "Name: delay_min, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "more_than_86400 = deal_with_23_hours(trips_routes_times2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "003abe45-5d55-4839-80f8-83e693214427", + "metadata": {}, + "outputs": [], + "source": [ + "percentiles = [0.01, 0.02, 0.05, 0.1, 0.9, 0.95, 0.98, 0.99]" + ] + }, + { + "cell_type": "markdown", + "id": "13d8ff3a-1ed5-46bb-af2e-86eaa7616046", + "metadata": {}, + "source": [ + "### Help, can't fix everythin...how to address these edge cases? 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f0e5d8a-5e26-4c75-b593-2cc9aa275a35", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2.converted_schd_arrival.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fefa0cfc-79af-4538-b613-bf4669daecd0", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\n", + " trips_routes_times2[\"converted_schd_arrival\"].dt.strftime(\"%Y-%m-%d\")\n", + " == \"2024-05-21\"\n", + "].head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1844a0a-26e0-473f-88a0-3c9bd6748fac", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\n", + " trips_routes_times2[\"converted_schd_arrival\"].dt.strftime(\"%Y-%m-%d\")\n", + " == \"2024-05-23\"\n", + "].head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84a4772d-653b-49dd-8db2-33e67f3c0708", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2.converted_rt_arrival.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "039bd4bb-999d-405b-85a3-8b6e2f41c454", + "metadata": {}, + "source": [ + "#### Question: Last time, I received the suggestion to throw away things more than 2 hrs because that is not bunching. But wouldn't this be misleading as a metric when we want to calculate all the rows?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a16335dc-47a0-4ea1-9612-fc6d5dfea1d6", + "metadata": {}, + "outputs": [], + "source": [ + "print(trips_routes_times2.delay_min.describe(percentiles))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "076d1774-5ddb-43e9-99f8-729deec0357c", + "metadata": {}, + "outputs": [], + "source": [ + "# trips_routes_times2 = trips_routes_times2.loc[\n", + "# trips_routes_times2.delay_min < 120\n", + "# ].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4029214f-47c8-4a4e-917b-dd5e70bbae25", + "metadata": {}, + "outputs": [], + "source": [ + "# trips_routes_times2 = trips_routes_times2.loc[\n", + "# trips_routes_times2.delay_min > -120\n", + "# ].reset_index(drop=True)" + ] + }, + { + "cell_type": "markdown", + "id": "81bd753a-08ee-4d09-ac79-213e1e605405", + "metadata": {}, + "source": [ + "### Calculate the actual & scheduled headway at the `operator-route-direction_id-stop_sequence-stop_id` grain\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9171520-7358-4f22-9d9c-c5156e710f1b", + "metadata": {}, + "outputs": [], + "source": [ + "groupby_cols = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_long_name\",\n", + " \"direction_id\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1ab53c4-bd2d-44cc-8e4d-db7801cb5a3b", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"actual_arrival_lag_min\"] = (\n", + " trips_routes_times2.groupby(groupby_cols)[\"converted_rt_arrival\"]\n", + " .diff()\n", + " .dt.total_seconds()\n", + " / 60\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6328c5fb-205e-4a78-a3d3-62fafa88a4cd", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"scheduled_arrival_lag_min\"] = (\n", + " 
trips_routes_times2.groupby(groupby_cols)[\"converted_schd_arrival\"]\n", + " .diff()\n", + " .dt.total_seconds()\n", + " / 60\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fda264c-9496-4164-95d1-f7c6e9fb1784", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"scheduled_arrival_lag_min\"].describe(percentiles)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "875b27f9-5cd5-496a-a9e3-df7f609e4dd7", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"actual_arrival_lag_min\"].describe(percentiles)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0370dc8b-8a4d-4e16-8943-a559aa0aac07", + "metadata": {}, + "outputs": [], + "source": [ + "preview_time_col = [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"route_id\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " \"converted_rt_arrival\",\n", + " \"actual_arrival_lag_min\",\n", + " \"converted_schd_arrival\",\n", + " \"scheduled_arrival_lag_min\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ba17b18-b9cb-4580-8eeb-47d9d058e9f1", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2.loc[\n", + " (\n", + " trips_routes_times2.schedule_gtfs_dataset_key\n", + " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n", + " )\n", + " & (trips_routes_times2.route_id == \"204-13172\")\n", + " & (trips_routes_times2.stop_id == \"3961\")\n", + "][preview_time_col]" + ] + }, + { + "cell_type": "markdown", + "id": "8a5f6f72-d463-4076-80ce-d22ab1f718b1", + "metadata": {}, + "source": [ + "### Many lags are actually empty b/c it's the first of that groupby-sequence." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98df5908-9530-43c8-ab6c-9283f9cc78d0", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "trips_routes_times2[trips_routes_times2[\"scheduled_arrival_lag_min\"].isna()][\n", + " preview_time_col\n", + "].sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b58b89fc-f675-4538-b456-d91431e30229", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "trips_routes_times2.loc[\n", + " (\n", + " trips_routes_times2.schedule_gtfs_dataset_key\n", + " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n", + " )\n", + " & (trips_routes_times2.route_id == \"204-13172\")\n", + " & (trips_routes_times2.stop_sequence == 2)\n", + "][preview_time_col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22564ce6-554d-4a2d-81cc-5b98018116b4", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "trips_routes_times2.loc[\n", + " (\n", + " trips_routes_times2.schedule_gtfs_dataset_key\n", + " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n", + " )\n", + " & (trips_routes_times2.route_id == \"204-13172\")\n", + " & (trips_routes_times2.stop_sequence == 59)\n", + "][preview_time_col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "908b3be6-72b5-483e-8be5-a2932c59cdd4", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "trips_routes_times2.loc[\n", + " (\n", + " trips_routes_times2.schedule_gtfs_dataset_key\n", + " == \"0666caf3ec1ecc96b74f4477ee4bc939\"\n", + " )\n", + " & (trips_routes_times2.route_id == \"204-13172\")\n", + " & (trips_routes_times2.stop_sequence == 46)\n", + " & (trips_routes_times2.stop_id == \"5685\")\n", + "][preview_time_col]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1997e407-9483-4142-95db-2a1892fbace7", + "metadata": {}, + "outputs": [], + "source": [ + "# 
rt_stop_times4 = rt_stop_times4.fillna(0)" + ] + }, + { + "cell_type": "markdown", + "id": "28362518-a54b-4f5d-a4d7-24a3d8ddefd0", + "metadata": {}, + "source": [ + "### Transit Matters Method" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0f0f4b7-fa64-4b01-a141-5dd78c59693b", + "metadata": {}, + "outputs": [], + "source": [ + "transit_matters_df1 = trips_routes_times2.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bea63e5-45d0-4d06-8c1c-fd34a69ffde7", + "metadata": {}, + "outputs": [], + "source": [ + "transit_matters_df1[\"pct_actual_schd_headway\"] = (\n", + " transit_matters_df1.actual_arrival_lag_min\n", + " / transit_matters_df1.scheduled_arrival_lag_min\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c67c6299-68f0-414f-a9c1-e0b27511b9e5", + "metadata": {}, + "outputs": [], + "source": [ + "transit_matters_df1[\"bunched_y_n\"] = np.where(\n", + " transit_matters_df1[\"pct_actual_schd_headway\"] < 0.25, \"bunched\", \"not bunched\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7d56ce7-66e9-4084-a725-a9eff7c4c5b2", + "metadata": {}, + "outputs": [], + "source": [ + "transit_matters_df1.bunched_y_n.value_counts() / len(transit_matters_df1)" + ] + }, + { + "cell_type": "markdown", + "id": "db10254b-d5d4-4619-9c6e-10fab19ec6b2", + "metadata": {}, + "source": [ + "#### Aggregate.\n", + "* At this point, the sequence doesn't matter; we just care about how bunched the traffic is around one particular stop. \n", + "* See how many trips for that grain are considered \"bunched\" or not."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30bbbbad-4587-43e3-b5c7-632079f5a588", + "metadata": {}, + "outputs": [], + "source": [ + "def bunched_not_bunched(\n", + " df: pd.DataFrame, bunched_y_n: str, groupby_cols: list\n", + ") -> pd.DataFrame:\n", + " df2 = df.loc[df.bunched_y_n == bunched_y_n].reset_index(drop=True)\n", + "\n", + " bunched_y_n = bunched_y_n.replace(\" \", \"_\")\n", + " agg1 = (\n", + " df2.groupby(groupby_cols).agg({\"trip_instance_key\": \"nunique\"}).reset_index()\n", + " ).rename(columns={\"trip_instance_key\": f\"{bunched_y_n}_trips\"})\n", + " return agg1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb7f41ff-ff97-4a00-b4f8-b9cdab73da64", + "metadata": {}, + "outputs": [], + "source": [ + "def agg_final_df(df: pd.DataFrame) -> pd.DataFrame:\n", + " groupby_cols = [\n", + " \"caltrans_district\",\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"feed_key\",\n", + " \"organization_name\",\n", + " \"route_long_name\",\n", + " \"route_type\",\n", + " \"route_id\",\n", + " \"direction_id\",\n", + " \"stop_id\",\n", + " \"stop_sequence\",\n", + " ]\n", + "\n", + " # Find total trips that are bunched\n", + " bunched = bunched_not_bunched(df, \"bunched\", groupby_cols)\n", + "\n", + " # Find total trips that are NOT bunched\n", + " not_bunched = bunched_not_bunched(df, \"not bunched\", groupby_cols)\n", + "\n", + " # Merge\n", + " m1 = pd.merge(not_bunched, bunched, on=groupby_cols, how=\"outer\")\n", + "\n", + " # Find the % of bunched trips\n", + " m1 = m1.fillna(0)\n", + " m1[\"all_trips\"] = m1.not_bunched_trips + m1.bunched_trips\n", + " m1[\"per_trip_bunched_per_stop\"] = m1.bunched_trips / m1.all_trips\n", + "\n", + " # Filter out any rows with only one trip of that groupby combo\n", + " # for that service date\n", + " m1 = m1.loc[m1.all_trips > 1].reset_index(drop=True)\n", + " m1 = m1.drop(columns=[\"not_bunched_trips\", \"bunched_trips\"])\n", + "\n", + " return m1" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "a33d4b24-6737-4c2a-8e38-eec929242b37", + "metadata": {}, + "outputs": [], + "source": [ + "transit_matters_m1 = agg_final_df(transit_matters_df1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a919bad-bb6f-4247-840d-e6f00a349b06", + "metadata": {}, + "outputs": [], + "source": [ + "# transit_matters_m1 = (\n", + "# transit_matters_m1.sort_values(by=[\"all_trips\"], ascending=False)\n", + "# .drop_duplicates(subset=transit_matters_agg)\n", + "# .reset_index(drop=True)\n", + "# )" + ] + }, + { + "cell_type": "markdown", + "id": "710770ad-2d1a-4636-9667-02d320b689e8", + "metadata": {}, + "source": [ + "### Help: Swapped order of a bus is messing with the transit matters metric.\n", + "* How to solve for this?? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56e4a936-215b-4bdd-8f45-e86b613198bc", + "metadata": {}, + "outputs": [], + "source": [ + "preview_cols = [\n", + " \"converted_rt_arrival\",\n", + " \"actual_arrival_lag_min\",\n", + " \"converted_schd_arrival\",\n", + " \"scheduled_arrival_lag_min\",\n", + " \"pct_actual_schd_headway\",\n", + " \"bunched_y_n\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e24cb76-4f68-4377-a0df-cedb21904b6e", + "metadata": {}, + "outputs": [], + "source": [ + "example2 = transit_matters_df1.loc[\n", + " (transit_matters_df1.stop_id == \"2307719\")\n", + " & (transit_matters_df1.organization_name == \"City of Visalia\")\n", + " & (transit_matters_df1.route_id == \"2042\")\n", + " & (transit_matters_df1.shape_array_key == \"60da59c7000ea5dcb5f845d8fa227f14\")\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "67169748-2cba-4591-8cd5-3cc8e4e3a556", + "metadata": {}, + "source": [ + "#### Starting row 33484: the RT Arrival time is swapped. A bus that was scheduled to arrive at 4:27 arrived boefre the bus arrived at 3:42.\n", + "* This repeats again row 33486." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b34f0a6-6a2b-4e56-b0fe-bd0f57b53db8", + "metadata": {}, + "outputs": [], + "source": [ + "example2[preview_cols]" + ] + }, + { + "cell_type": "markdown", + "id": "b0579e78-2a95-4d8b-9761-2824aa39a8eb", + "metadata": {}, + "source": [ + "### Use 2 minute benchmark\n", + "* [Source](https://static1.squarespace.com/static/533b9a24e4b01d79d0ae4376/t/645e82de1f570b31497c44dc/1683915486889/TransitMatters-Headwaymanagement.pdf)\n", + "* Justifying the use of\n", + "headway maintenance. For example, in April\n", + "2022 the 66 bus significantly bunched around\n", + "several stops. When bunching is defined as\n", + "buses that run within two minutes or less of\n", + "each other, inbound buses towards Nubian\n", + "Square bunched 10% of the time at Brigham\n", + "Circle, 9% at Brookline Village and Roxbury\n", + "Crossing, and 8% of the time at Coolidge\n", + "Corner. Bunching is even more dramatic\n", + "outbound towards Harvard Square where\n", + "buses bunched over 35% of the time at Winship\n", + "St, 13% at Coolidge Corner and Harvard Ave at\n", + "Commonwealth Ave, and 12% at North Harvard\n", + "St at Western Ave. 
View more data about bus\n", + "bunching through the TransitMatters Data\n", + "Dashboard here.\n", + "\n", + "* To Do: add back in route & operator information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0706e7e-0d56-43b2-bf3c-4205e9277c64", + "metadata": {}, + "outputs": [], + "source": [ + "two_minutes_df = trips_routes_times2.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2def9283-d995-4001-b412-0fa03a855cd5", + "metadata": {}, + "outputs": [], + "source": [ + "two_minutes_df[\"bunched_y_n\"] = np.where(\n", + " two_minutes_df[\"actual_arrival_lag_min\"] <= 2, \"bunched\", \"not bunched\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcd302cb-5f30-4318-8b28-cb29f6c376cd", + "metadata": {}, + "outputs": [], + "source": [ + "two_minutes_df.bunched_y_n.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99de2fd0-2aed-4793-a33b-56d7ffc313bc", + "metadata": {}, + "outputs": [], + "source": [ + "final_two_minute = agg_final_df(two_minutes_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "416da93d-cd6d-4ad1-bf5e-98b20c188661", + "metadata": {}, + "outputs": [], + "source": [ + "final_two_minute.loc[\n", + " (final_two_minute.stop_id == \"2307695\")\n", + " & (final_two_minute.organization_name == \"City of Visalia\")\n", + " & (final_two_minute.route_id == \"2042\")\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "1a0b54b9-3d4c-4cc4-8243-e26a42c47e83", + "metadata": {}, + "source": [ + "### Comparing both outcomes\n", + "* There are so many more bunched trips for the 2 minute approach.\n", + "* Add back in schedule_gtfs_key and then grab stop level data from the warehouse." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a11e9bc2-70c6-488a-aa7e-f92d8b53c8e0", + "metadata": {}, + "outputs": [], + "source": [ + "final_two_minute.per_trip_bunched_per_stop.describe(percentiles)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19c741e4-6476-416b-a62d-51028c6eef68", + "metadata": {}, + "outputs": [], + "source": [ + "transit_matters_m1.per_trip_bunched_per_stop.describe(percentiles)" + ] + }, + { + "cell_type": "markdown", + "id": "a6304f84-80fb-4696-a312-d28545073b22", + "metadata": {}, + "source": [ + "### Make Visuals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "220981aa-623c-4156-8445-f349f0d98d45", + "metadata": {}, + "outputs": [], + "source": [ + "freq_range = [\n", + " \"#ccbb44\",\n", + " \"#e9d868\",\n", + " \"#fcb40e\",\n", + " \"#ff9c42\",\n", + " \"#fc5c04\",\n", + " \"#dd217d\",\n", + " \"#dd217d\",\n", + " \"#dd217d\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "949a750a-6e07-4c98-8020-7b201b9bd0fa", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a5ea9a8-1158-4681-9d97-37a131b2dff4", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2[\"hour\"] = trips_routes_times2[\"converted_rt_arrival\"].dt.hour\n", + "trips_routes_times2[\"min\"] = trips_routes_times2[\"converted_rt_arrival\"].dt.minute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "562aa5ef-6dac-443e-b646-88b72174d645", + "metadata": {}, + "outputs": [], + "source": [ + "trips_routes_times2.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec812e6c-3dc7-4e21-9478-1eae1a539ec2", + "metadata": {}, + "outputs": [], + "source": [ + "def compare_approaches(\n", + " stop_id: str, organization_name: str, route_id: str, stop_sequence: int\n", + "):\n", + " transit_matter = transit_matters_m1.loc[\n", + " 
(transit_matters_m1.stop_id == stop_id)\n", + " & (transit_matters_m1.organization_name == organization_name)\n", + " & (transit_matters_m1.route_id == route_id)\n", + " & (transit_matters_m1.stop_sequence == stop_sequence)\n", + " ]\n", + " display(transit_matter)\n", + "\n", + " two_min = final_two_minute.loc[\n", + " (final_two_minute.stop_id == stop_id)\n", + " & (final_two_minute.organization_name == organization_name)\n", + " & (final_two_minute.route_id == route_id)\n", + " & (final_two_minute.stop_sequence == stop_sequence)\n", + " ]\n", + "\n", + " display(two_min)\n", + " total_trips = trips_routes_times2.loc[\n", + " (trips_routes_times2.stop_id == stop_id)\n", + " & (trips_routes_times2.organization_name == organization_name)\n", + " & (trips_routes_times2.route_id == route_id)\n", + " & (trips_routes_times2.stop_sequence == stop_sequence)\n", + " ]\n", + "\n", + " display(total_trips.trip_instance_key.nunique())\n", + "\n", + " chart = (\n", + " alt.Chart(total_trips)\n", + " .mark_circle(size=500)\n", + " .encode(\n", + " x=\"hour\",\n", + " y=\"min\",\n", + " color=alt.Color(\n", + " \"hour\",\n", + " scale=alt.Scale(range=freq_range),\n", + " ),\n", + " tooltip=[\"hour\", \"min\", \"actual_arrival_lag_min\"],\n", + " )\n", + " .properties(width=800, height=400)\n", + " )\n", + " display(chart)\n", + " return total_trips" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b9efd9b-6799-40a7-baff-3a936790ddd3", + "metadata": {}, + "outputs": [], + "source": [ + "test1 = compare_approaches(\n", + " stop_id=\"5685\",\n", + " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n", + " route_id=\"204-13172\",\n", + " stop_sequence=46,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6486f4bf-890d-45ae-b6dc-8c938171a466", + "metadata": {}, + "outputs": [], + "source": [ + "test2 = compare_approaches(\n", + " stop_id=\"2307469\",\n", + " organization_name=\"City of 
Visalia\",\n", + " route_id=\"2042\",\n", + " stop_sequence=27,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85a9a49c-0d2e-4d1a-9b7d-601880d45d5d", + "metadata": {}, + "outputs": [], + "source": [ + "test3 = compare_approaches(\n", + " stop_id=\"3104\",\n", + " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n", + " route_id=\"33-13172\",\n", + " stop_sequence=80,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "713f259d-125b-473d-abff-877f3e2d4973", + "metadata": {}, + "outputs": [], + "source": [ + "test4 = compare_approaches(\n", + " stop_id=\"15320\",\n", + " organization_name=\"Los Angeles County Metropolitan Transportation Authority\",\n", + " route_id=\"33-13172\",\n", + " stop_sequence=64,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b63c81fa-efdc-464b-a567-23ca9c194321", + "metadata": {}, + "outputs": [], + "source": [ + "test5 = compare_approaches(\n", + " stop_id=\"3288014\",\n", + " organization_name=\"City of Thousand Oaks\",\n", + " route_id=\"3402\",\n", + " stop_sequence=16,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6be5a05-07e3-4a11-a5cd-844380717548", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "transit_matters_m1.sort_values(by=[\"per_trip_bunched_per_stop\"], ascending=False)" + ] + }, + { + "cell_type": "markdown", + "id": "34ba711c-f004-4a62-81ef-0effbf5401ed", + "metadata": { + "tags": [] + }, + "source": [ + "### Make Maps \n", + "* Think I actually need the vehicle positions since stops are literally the stop's geometry, so it'll always be plotting on the same spot.\n", + "\n", + "* https://github.com/cal-itp/data-analyses/blob/db19b70329f1e817236bda13707dd903c24abb4c/_shared_utils/shared_utils/gtfs_utils_v2.py#L371\n", + "* https://github.com/cal-itp/data-analyses/blob/main/gtfs_funnel/download_stops.py" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "ef8e36d2-2209-4a99-95cf-11ca0371d93c", + "metadata": {}, + "outputs": [], + "source": [ + "stop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c323fdc-14bd-442f-b3d0-5b5fa20a2e48", + "metadata": {}, + "outputs": [], + "source": [ + "# What is this file?\n", + "vps_gdf = gpd.read_parquet(\n", + " \"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_2024-05-22.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8b2d27e-0a1d-4d6d-a5e8-da7e915b4a24", + "metadata": {}, + "outputs": [], + "source": [ + "vps_gdf = vps_gdf[\n", + " [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"trip_instance_key\",\n", + " \"location_timestamp_local\",\n", + " \"geometry\",\n", + " ]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe5770c0-07e2-4cd0-905c-6b5a28f9a1ba", + "metadata": {}, + "outputs": [], + "source": [ + "vps_df = vps_gdf[\n", + " [\n", + " \"schedule_gtfs_dataset_key\",\n", + " \"trip_instance_key\",\n", + " \"location_timestamp_local\",\n", + " ]\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6016595b-b80d-4fef-938b-5d75e35c8a8d", + "metadata": {}, + "outputs": [], + "source": [ + "vps_m1 = pd.merge(\n", + " vps_df,\n", + " trips_routes_times2,\n", + " on=[\"schedule_gtfs_dataset_key\", \"trip_instance_key\"],\n", + " how=\"inner\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03064c85-8e73-4370-8c00-a0f34fa76029", + "metadata": {}, + "outputs": [], + "source": [ + "def one_stop(df: pd.DataFrame, stop_id: str, org_name: str, route_id: str):\n", + " # Look at one route & stop\n", + " test_route1 = df.loc[\n", + " (df.organization_name == org_name)\n", + " & (df.route_id == route_id)\n", + " & (df.stop_id == stop_id)\n", + " ]\n", + "\n", + " compare_approaches(stop_id=stop_id, organization_name=org_name, route_id=route_id)\n", + 
"\n", + " # display(test_route1.explore(\"time_int\", marker_kwds = {'radius':20}))\n", + " return test_route1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c513281-039e-45a5-98d0-9220e132f92a", + "metadata": {}, + "outputs": [], + "source": [ + "metro_test1 = one_stop(\n", + " vps_m1,\n", + " stop_id=\"5700\",\n", + " org_name=\"Los Angeles County Metropolitan Transportation Authority\",\n", + " route_id=\"204-13172\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a8d5439-0529-4adf-b71f-6bcfa6353392", + "metadata": {}, + "outputs": [], + "source": [ + "metro_test1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ead8a95-85fd-454d-b7ce-d5a85d6406d5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0baa05d0-40fe-4406-a80e-5acd80a8edae", + "metadata": {}, + "outputs": [], + "source": [ + "metro_test1.sample(1)" + ] + }, + { + "cell_type": "markdown", + "id": "76a7e40c-c3d8-476c-b5e1-f38f7cc12345", + "metadata": {}, + "source": [ + "### Other\n", + "* https://www.sciencedirect.com/science/article/pii/S1366554523003666\n", + "* https://www.sciencedirect.com/science/article/pii/S0968090X22002492?ref=pdf_download&fr=RR-2&rr=8d7d6fb73d8015be" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}